annotate mirbase_functions.py @ 39:1bfac419081d draft default tip

Uploaded
author glogobyte
date Tue, 17 Oct 2023 09:02:24 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
39
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
1 import itertools
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
2 import re
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
3 import urllib.request
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
4 import gzip
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
5 import copy
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
6 from collections import OrderedDict
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
7
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
8
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
9
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
10 # Read a file and return it as a list
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def read(path, flag):
    """Read a text file and return its lines as a list.

    Args:
        path: Path of the file to read.
        flag: ``0`` returns the lines exactly as ``readlines()`` yields them
            (line terminators kept); ``1`` returns ``read().splitlines()``
            (line terminators removed).

    Returns:
        A list of strings for flag ``0`` or ``1``. Any other flag value
        returns ``None`` without opening the file (unchanged legacy
        behaviour).
    """
    if flag not in (0, 1):
        return None

    # The context manager closes the handle; no explicit close() is needed.
    with open(path) as fp:
        if flag == 0:
            return fp.readlines()
        return fp.read().splitlines()
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
23
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
24 # Write a list to a txt file
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def write(path, list):
    """Write the inner items of every record to a text file.

    For each record ``x`` in ``list`` the items ``x[1:-1]`` (everything
    except the first and the last item) are joined with tabs and written
    to ``path``. The file is truncated first.

    Args:
        path: Path of the output file.
        list: Iterable of sliceable records whose inner items are strings.
            (The parameter name shadows the builtin; it is kept so that
            keyword callers keep working.)

    NOTE(review): no line terminator is written between records, so
    consecutive records end up on the same line - confirm this is intended.
    """
    # The context manager closes the handle; no explicit close() is needed.
    with open(path, 'w') as fp:
        for x in list:
            fp.write("\t".join(x[1:-1]))
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
30
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
31
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
32 #################################################################################################################>
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
33
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
34 # Detect the longest common substring sequence between two mirnas
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def longestSubstring(str1, str2):
    """Return the longest common contiguous substring of two sequences.

    Args:
        str1: First sequence; the returned substring is sliced from it.
        str2: Second sequence.

    Returns:
        The longest block shared by ``str1`` and ``str2``. When nothing is
        shared, a message is printed and ``None`` is returned (unchanged
        legacy behaviour).
    """
    from difflib import SequenceMatcher

    # autojunk=False: the default heuristic discards "popular" characters
    # as soon as the second sequence has 200 or more items, which makes the
    # matcher miss (part of) the real longest match on long inputs.
    seqMatch = SequenceMatcher(None, str1, str2, autojunk=False)

    # The result looks like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    if match.size != 0:
        return str1[match.a: match.a + match.size]

    print('No longest common sub-string found')
    return None
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
51
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
52 #################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
53
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
54 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
55
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
56 This function concatenates miRNAs which are generated from different chromosomes
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
57 and eliminates the duplications of miRNAs on every sample
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
58
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
59 input: detected miRNAs
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
60 output: collapsed miRNAs without duplicates
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
61
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
62 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
63
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
64
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def remove_duplicates(mirnas):
    """Collapse reads mapped to the same miRNA on several chromosomes.

    Rows that share the sequence (column 9), the read name (column 0) and
    the miRNA name (reference column 2 up to the first ``_``) are treated
    as the same read. Their reference name is replaced by one collapsed
    name that lists every chromosome/strand found in the group, after
    which identical rows are removed.

    Args:
        mirnas: List of SAM rows (lists of strings with at least ten
            columns). Every reference name must contain ``chr`` followed
            by the chromosome and the strand, e.g. ``..._chr9(+)``.
            The rows are renamed in place and the list is sorted in place.

    Returns:
        A new sorted list of the renamed rows without duplicates.
    """
    # Distinct reference names per (sequence, read name, miRNA name).
    grouped_refs = {}
    for row in mirnas:
        key = (row[9], row[0], row[2].split("_")[0])
        grouped_refs.setdefault(key, set()).add(row[2])

    # One collapsed name per group: miRNA name + every chromosome found.
    collapsed_names = {
        key: key[2] + _chromosome_suffix(refs)
        for key, refs in grouped_refs.items()
    }

    for row in mirnas:
        parts = row[2].split("_")
        collapsed = collapsed_names[(row[9], row[0], parts[0])]
        if len(parts) > 3:
            # Template isomiR: keep the two trailing position fields.
            row[2] = collapsed + "_t_" + parts[-2] + "_" + parts[-1]
        elif len(parts) == 3:
            # Canonical miRNA: the collapsed name is used as it is.
            row[2] = collapsed

    # Remove duplicates (groupby only merges neighbours, hence the sort).
    mirnas.sort()
    return [row for row, _ in itertools.groupby(mirnas)]


def _chromosome_suffix(ref_names):
    """Build the ``_chr<N>(<strand>)...`` suffix for a group of references."""
    chromosomes = []
    for ref in ref_names:
        after_chr = ref.split("chr")[1]
        number = after_chr.split("(")[0]
        if number.isdigit():
            # Signed integer: the strand character becomes the sign.
            chromosomes.append(int(after_chr.split("(")[1][0] + number))
        else:
            # Non-numeric chromosome such as "X(-)" or "Y(+)".
            chromosomes.append(after_chr[0:4])

    # Integers and strings cannot be sorted together, so everything is
    # compared as text as soon as one non-numeric chromosome is present.
    if any(isinstance(entry, str) for entry in chromosomes):
        chromosomes = [str(entry) for entry in chromosomes]

    suffix = ""
    for entry in sorted(set(chromosomes)):
        if re.match(r"[-+]?\d+$", str(entry)) is not None:
            strand = "(-)" if int(entry) < 0 else "(+)"
            suffix += "_chr" + str(abs(int(entry))) + strand
        else:
            suffix += "_chr" + str(entry)
    return suffix
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
129
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
130 #############################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
131
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
132 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
133
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
134 This function identifies and classifies the miRNAs which are detected by the alignment tool.
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
135
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
136 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
137
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):
    """Classify the mapped reads of one SAM file against the mature miRNAs.

    The file is read with ``read(path, 0)``. Header lines (first tab field
    containing ``@``) are dropped. Rows with flag ``'0'`` and a sequence of
    18 to 26 characters are compared with the mature miRNA that has the
    same name, renamed as template isomiRs when they are shifted, collapsed
    with ``remove_duplicates`` and published to the shared containers.

    Args:
        mature_mirnas: Flat list alternating a header line (even index,
            ``>name ...``) and its sequence line (next index).
        path: Path of the SAM file.
        file: Label of the SAM file, stored in ``file_order`` and ``data``.
        case: Group tag; results are published for ``"c"`` or ``"t"``.
        l: Lock guarding the shared containers (``acquire``/``release``).
        samples, data, file_order, names_n_seqs, deseq, mirna_names,
        ini_sample: Shared list-like containers that receive the results.
        unmap_seq, unmap_counts: Shared counters (objects with ``.value``)
            increased by the number of rows with flag ``'4'`` and by the sum
            of their read counts.

    Read names are expected to look like ``<id>-<count>``: the count is
    parsed from the text after the first ``-``.
    """

    def signed(offset):
        # "0" when there is no shift, otherwise an explicitly signed
        # integer such as "+1" or "-2".
        return str(offset) if offset == 0 else '{:+d}'.format(offset)

    # read the sam file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]  # remove the header
    unique_seq = [x for x in main_sam if x[1] == '0' and 18 <= len(x[9]) <= 26]  # mapped reads of miRNA length
    filter_sam = [[x[0], x[1], x[2], len(x[9])] for x in main_sam]  # name, flag, reference, sequence length

    sorted_uni_arms = []

    for i in range(0, len(mature_mirnas), 2):
        mature_name = mature_mirnas[i].split(" ")[0][1:]
        tmp_count_reads = 0   # total number of reads of this miRNA
        tmp_count_seq = 0     # total number of sequences of this miRNA
        flagged = False

        for j in range(len(unique_seq)):
            ref_parts = unique_seq[j][2].split("_")

            # A "{" marks a miRNA generated from two loci on the same
            # chromosome; the last four characters are not part of the name.
            mirna = ref_parts[0][:-4] if "{" in ref_parts[0] else ref_parts[0]
            if mature_name != mirna:
                continue

            # Differences between the canonical miRNA and the detected read
            temp_mature = mature_mirnas[i + 1].strip().replace("U", "T")
            read_seq = unique_seq[j][9]
            off_part = longestSubstring(temp_mature, read_seq)

            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = read_seq.split(off_part)
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            # Special miRNAs such as hsa-miR-8485: both sequences carry
            # extra bases on the same side of the shared part. The row is
            # replaced by the marker 1 and dropped after this scan.
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                unique_seq[j] = 1
                flagged = True
                continue

            # Keep the findings
            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
            tmp_count_seq = tmp_count_seq + 1

            # Shifted reads get a template-isomiR name with both offsets
            if pre_pos != 0 or post_pos != 0:
                unique_seq[j][2] = ref_parts[0] + "_" + ref_parts[2] + "_t_" + signed(pre_pos) + "_" + signed(post_pos)

        # Drop the marker values before the next miRNA is processed
        if flagged:
            unique_seq = [row for row in unique_seq if row != 1]

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_name, tmp_count_seq, tmp_count_reads])

    # Sorting of the metrics for database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Collapsing of miRNAs and removing of duplicates
    collapsed_mirnas = remove_duplicates(unique_seq)

    # Correction of the metrics after collapsing (the order is not re-sorted)
    # NOTE(review): this is a substring test, so a name that is a prefix of
    # another miRNA name is counted for both - confirm this is intended.
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in collapsed_mirnas:
            if y[0] in x[2].split("_")[0]:
                counts += int(x[0].split("-")[1])
                seqs += 1
        y[1] = seqs
        y[2] = counts

    # Output variables. try/finally releases the lock even when one of the
    # statements below raises.
    # NOTE(review): indentation was not recoverable from the reviewed copy;
    # the whole output section is assumed to be guarded by the case check -
    # confirm against the repository.
    l.acquire()
    try:
        if case == "c" or case == "t":
            names_n_seqs.extend([[y[2], y[9]] for y in collapsed_mirnas])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas])
            mirna_names.extend([z[2] for z in collapsed_mirnas])
            unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])  # unmapped unique sequences (for a graph)
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])  # unmapped read counts (for a graph)
            file_order.append(file)  # order in which the SAM files were processed
            samples.append(collapsed_mirnas)  # the processed detected miRNAs
            data.append([case, file, collapsed_mirnas, sorted_uni_arms])
            ini_sample.append(filter_sam)  # the filtered sam file
    finally:
        l.release()
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
242
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
243
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
244 ######################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
245
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
246
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
247 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
248
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
249 Read a sam file from Bowtie and do the following:
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
250
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
251 1) Remove reverse stranded mapped reads
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
252 2) Remove unmapped reads
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
253 3) Remove all sequences with reads less than 11 reads
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
254 4) Sort the arms with the most sequences in decreasing order
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
255 5) Sort the sequences of every arm with the most reads in decreasing rate
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
256 6) Calculate total number of sequences of every arm
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
257 7) Calculate total number of reads of sequences of every arm.
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
258 8) Store all the information in a txt file
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
259
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
260 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
261
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):
    """Detect non-template isomiRs among the unmapped reads of a SAM file.

    The file is read with ``read(path, 0)``. Header lines (first tab field
    containing ``@``) are dropped. Rows with flag ``'4'`` and a sequence of
    18 to 26 characters are kept. A read is reported when it contains a
    mature sequence exactly once, starts with it and has fewer than four
    extra characters after it. The copy of the row that is reported is
    renamed ``<mature name>_nont_<pre>_<post>_<added tail>``.

    Args:
        mature_mirnas: Flat list alternating a header line (even index,
            ``>name ...``) and its sequence line (odd index).
        path: Path of the SAM file.
        file: Label of the SAM file, stored in ``file_order`` and ``data``.
        case: Group tag; results are published for ``"c"`` or ``"t"``.
        l: Lock guarding the shared containers (``acquire``/``release``).
        data, file_order, n_deseq, names_n_seqs: Shared list-like
            containers that receive the results.

    Read names are expected to look like ``<id>-<count>``: the count is
    parsed from the text after the first ``-``.
    """

    # read the sam file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '4' and 18 <= len(x[9]) <= 26]

    uni_seq = []

    # Calculate the shifted positions of every non-template miRNA and add
    # them to its name
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        temp_mature = mature_mirnas[i].strip().replace("U", "T")
        mature_name = mature_mirnas[i - 1].split(" ")[0][1:]
        tmp_count_reads = 0   # total number of reads of this miRNA
        tmp_count_seq = 0     # total number of sequences of this miRNA

        for row in unique_seq:
            read_seq = row[9]

            # Only reads that contain the whole canonical sequence
            if temp_mature not in read_seq:
                continue

            off_part = longestSubstring(temp_mature, read_seq)

            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            # More than two pieces means the shared part occurs more than
            # once in the read; such reads are skipped.
            unique_diff = read_seq.split(off_part)
            if len(unique_diff) > 2:
                continue
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]

            if pre_pos == 0 and post_pos < 4:
                tmp_count_reads = tmp_count_reads + int(row[0].split("-")[1])
                tmp_count_seq = tmp_count_seq + 1

                # The original row stays untouched; a renamed copy is kept
                t_name = row.copy()
                t_name[2] = mature_name + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(read_seq[len(off_part):])
                uni_seq.append(t_name)

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_name, tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Remove duplicated rows while keeping the order of first appearance
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    # Output variables. try/finally releases the lock even when one of the
    # statements below raises.
    # NOTE(review): indentation was not recoverable from the reviewed copy;
    # the whole output section is assumed to be guarded by the case check -
    # confirm against the repository.
    l.acquire()
    try:
        if case == "c" or case == "t":
            names_n_seqs.extend([[y[2], y[9]] for y in unique_seq if y[2] != "*"])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2] != "*"])
            file_order.append(file)
            data.append([case, file, unique_seq, sorted_uni_arms])
    finally:
        l.release()
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
320
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
321 #################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
322
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def black_white(mirna_names_1,mirna_names_2,group,manager):
    """Pad ``group`` with zero rows for the entries it is missing.

    Entries of ``mirna_names_1`` that do not occur in ``mirna_names_2``
    are de-duplicated, padded with ``"0"`` strings up to the width of the
    first row of ``group`` and appended after the de-duplicated rows of
    ``group``. The result is pushed to ``manager``.

    Args:
        mirna_names_1: List of rows (lists) to look for.
        mirna_names_2: List of rows that are already present.
        group: Non-empty list of rows; sorted in place. Its first row
            defines the width (its length minus two gives the number of
            ``"0"`` entries added to every missing row).
        manager: List-like container that receives the final rows through
            ``extend``.

    The missing rows are padded in place, so the corresponding row objects
    of ``mirna_names_1`` are extended as well (unchanged legacy behaviour).
    """
    # Rows that only exist in the first list, sorted and without duplicates
    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
    add_names.sort()
    add_names = [row for row, _ in itertools.groupby(add_names)]

    group.sort()
    group = [row for row, _ in itertools.groupby(group)]

    # Every column after the first two is filled with "0"
    zeros = ["0"] * (len(group[0]) - 2)
    for row in add_names:
        row.extend(zeros)

    manager.extend(group + add_names)
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
337
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
338 ################################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
339
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def merging_dupes(group, f_dupes):
    """Build one merged name for every sequence that occurs on several rows.

    Args:
        group: list of rows; ``row[0]`` is a name and ``row[1]`` a sequence.
            Not modified.
        f_dupes: output list; extended with ``[sequence, merged_name]`` pairs.

    A row is flagged when its sequence or its name was already seen on an
    earlier row.  For every flagged sequence the names of all rows carrying it
    are collected; names that share the text before the first ``"_"`` are
    represented once, by the shortest of them.  The collected names are joined
    with ``"/"``: the first one, then the others in reverse order.

    Pairs are emitted in the order the duplicated sequences are first flagged,
    so the output is reproducible from run to run.
    """
    seen = set()                    # every name and sequence met so far
    names_by_seq = OrderedDict()    # flagged sequence -> collected names
    for row in group:
        name, seq = row[0], row[1]
        if seq in seen or name in seen:
            names_by_seq.setdefault(seq, [])
        else:
            seen.add(seq)
            seen.add(name)

    for row in group:
        names = names_by_seq.get(row[1])
        if names is None:
            continue
        name = row[0]
        prefix = name.split("_")[0]
        for pos, known in enumerate(names):
            if known.split("_")[0] == prefix:
                # Same prefix: keep only the shorter spelling (moved to the end).
                if len(name) < len(known):
                    del names[pos]
                    names.append(name)
                break
        else:
            names.append(name)

    # A real list (not a generator) so that proxy lists can receive it as well.
    f_dupes.extend(
        [[seq, "/".join(names[:1] + names[:0:-1])]
         for seq, names in names_by_seq.items()]
    )
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
383
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
384 ##########################################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
385
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def apply_merging_dupes(group, dupes, managger):
    """Rename rows by sequence, then pass on the sorted, de-duplicated table.

    Args:
        group: list of rows; ``row[0]`` is a name and ``row[1]`` a sequence.
            Rows are renamed in place and the list is sorted in place.
        dupes: list of entries whose first item is a sequence and whose second
            item is the name to give to every row carrying that sequence.
        managger: output list; extended with the rows of ``group`` after
            renaming, sorting and removal of identical rows.
    """
    # One lookup table instead of rescanning ``dupes`` for every row.  When a
    # sequence is listed more than once the last entry wins, as it did before.
    entry_by_seq = {entry[0]: entry for entry in dupes}

    for row in group:
        entry = entry_by_seq.get(row[1])
        if entry is not None:
            row[0] = entry[1]

    group.sort()
    managger.extend([row for row, _ in itertools.groupby(group)])
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
396
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
397 ###############################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
398
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
399
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def filter_low_counts(c_group, t_group, fil_c_group, fil_t_group, per, counts):
    """Keep the rows that reach a read threshold in enough columns.

    Args:
        c_group: list of rows; fields from index 2 onwards are parsed as ints.
        t_group: list of rows of the same layout, indexed in parallel with
            ``c_group`` (row ``i`` of both tables is kept or dropped together).
        fil_c_group: output list; extended with the kept rows of ``c_group``.
        fil_t_group: output list; extended with the kept rows of ``t_group``.
        per: percentage (anything ``int()`` accepts) of the count columns that
            must reach the threshold.
        counts: threshold (anything ``int()`` accepts) a cell must reach.

    A row pair is kept when either table has at least the required number of
    cells ``>= counts``.  The required number is ``round(per/100 * columns)``
    and is derived from the first row of each table, so a table with a single
    row is handled; an empty ``c_group`` leaves both outputs untouched.
    """
    if not c_group:
        return

    fraction = int(per) / 100
    threshold = int(counts)
    c_needed = round(fraction * (len(c_group[0]) - 2))
    t_needed = round(fraction * (len(t_group[0]) - 2))

    kept_c = []
    kept_t = []
    for i, c_row in enumerate(c_group):
        t_row = t_group[i]
        c_hits = sum(1 for cell in c_row[2:] if int(cell) >= threshold)
        t_hits = sum(1 for cell in t_row[2:] if int(cell) >= threshold)
        if c_hits >= c_needed or t_hits >= t_needed:
            kept_t.append(t_row)
            kept_c.append(c_row)

    fil_c_group.extend(kept_c)
    fil_t_group.extend(kept_t)
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
422
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
423 ##################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
424
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
425
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def _write_counts_table(path, sample_names, rows):
    """Write one tab-separated count table: a header line, then one line per row."""
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)
        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per):
    """Write the raw (and, when requested, filtered) count tables under ``Counts/``.

    Args:
        raw_con, raw_tre: unfiltered rows (lists of strings) of the two groups.
        fil_con, fil_tre: filtered rows of the two groups.
        con_file_order, tre_file_order: column names written after the
            ``Name`` and ``Sequence`` header fields of the respective table.
        flag: ``1`` writes the "Templated" tables, ``2`` the "Non-Templated"
            ones; any other value writes nothing.
        group_name1: name used in the file names of the ``*_con`` tables.
        group_name2: name used in the file names of the ``*_tre`` tables.
        per: the filtered tables are written unless ``int(per) == -1``.

    Files are named ``Counts/<Filtered|Raw> <group> <Templated|Non-Templated>
    Counts``; the ``Counts`` directory must already exist.  Every file is
    closed even if writing fails part-way.
    """
    if flag == 1:
        kind = "Templated"
    elif flag == 2:
        kind = "Non-Templated"
    else:
        return

    if int(per) != -1:
        _write_counts_table('Counts/Filtered ' + group_name2 + ' ' + kind + ' Counts', tre_file_order, fil_tre)
        _write_counts_table('Counts/Filtered ' + group_name1 + ' ' + kind + ' Counts', con_file_order, fil_con)

    _write_counts_table('Counts/Raw ' + group_name2 + ' ' + kind + ' Counts', tre_file_order, raw_tre)
    _write_counts_table('Counts/Raw ' + group_name1 + ' ' + kind + ' Counts', con_file_order, raw_con)
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
516
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
517
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
518 #########################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
519
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def temp_counts_to_diff(names, samp, folder):
    """Write one two-column text file per count column of ``samp``.

    Args:
        names: file/column names; ``names[k]`` belongs to column ``k + 2``.
        samp: list of rows of strings; ``row[0]`` is written as the id and the
            fields from index 2 onwards are the per-file values.
        folder: path prefix prepended as-is to ``<name>.txt`` (so it should
            end with a path separator).

    Each file starts with the header ``miRNA id<TAB><name>`` followed by one
    ``<row[0]><TAB><value>`` line per row.  Nothing is written when ``samp``
    is empty, because the number of columns is taken from its first row.
    """
    if not samp:
        return

    for col in range(2, len(samp[0])):
        sample_name = names[col - 2]
        # ``with`` closes the file even if a row is malformed.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")
            for row in samp:
                fp.write("%s" % "\t".join([row[0], row[col]]) + "\n")
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
530
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
531 ##################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
532
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def DB_write(con, name, unique_seq, sorted_uni_arms, f):
    """Write a per-sample report grouping the rows of ``unique_seq`` by arm.

    Args:
        con: ``"c"`` or ``"t"``; selects the output folder together with ``f``.
        name: file name inside the selected folder.
        unique_seq: list of rows; ``row[0]`` must look like ``<text>-<int>``
            (the integer is printed as the read count and used for sorting),
            ``row[2]`` is the name and ``row[9]`` the sequence.
        sorted_uni_arms: list of entries ``[arm, sequence_count, total_reads]``
            printed in the banner of each section.
        f: ``1`` writes to ``split1/`` (``con == "c"``) or ``split2/``
            (``con == "t"``); a row belongs to an arm when the arm is contained
            in the part of ``row[2]`` before the first ``"_"``, and a section
            is written for every arm.  ``2`` writes to ``split3/`` or
            ``split4/``; a row belongs to an arm when the arm equals the part
            of ``row[2]`` before ``"_nont_"``, and arms without rows are
            skipped.  Any other value does nothing.

    Raises:
        ValueError: if ``f`` is 1 or 2 but ``con`` is neither ``"c"`` nor ``"t"``
            (previously this surfaced as an unbound-variable error).
    """
    if f == 1:
        folders = {"c": 'split1/', "t": 'split2/'}
    elif f == 2:
        folders = {"c": 'split3/', "t": 'split4/'}
    else:
        return

    if con not in folders:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    banner = "*********************************************************************************************************"

    # ``with`` guarantees the report is closed even if a row is malformed.
    with open(folders[con] + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))

        for arm in sorted_uni_arms:
            if f == 1:
                rows = [row for row in unique_seq if arm[0] in row[2].split("_")[0]]
            else:
                rows = [row for row in unique_seq if arm[0] == row[2].split("_nont_")[0]]
                if not rows:
                    continue

            # Highest read count first; equal counts keep their input order.
            rows.sort(key=lambda row: int(row[0].split('-')[1]), reverse=True)

            fp.write(banner + "\n")
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(arm[0]), "Sequence count = " + str(arm[1]), "Total reads = " + str(arm[2]), "|"))
            fp.write(banner + "\n\n")
            for row in rows:
                fp.write("%-8s\t%-40s\t%s\n" % (row[0].split("-")[1], row[2], row[9]))
            fp.write("\n" + "\n")
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
585
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
586
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
587 ##########################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
588
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Select rows that out-count a three-part-named row and export them.

    Args:
        pre_unique_seq: list of rows (mutable lists); ``row[0]`` must look like
            ``<text>-<int>``, ``row[2]`` is a name and ``row[9]`` a sequence.
            The name of every selected row is prefixed with ``">"`` in place.
        mat_mirnas: output list; for every selected row its (prefixed) name
            and then its sequence are appended.
        l: lock object providing ``acquire()``/``release()``; held while
            ``mat_mirnas`` is extended.

    For every row ``x`` whose name splits into exactly three ``"_"`` parts,
    each row ``y`` whose name contains ``x``'s name and whose integer is
    larger than ``x``'s is selected once.
    """
    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_"))==3:
            for y in pre_unique_seq:
                if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
                    # The order matters here: once y is selected its name is
                    # rewritten, and that rewritten name is what later
                    # membership checks (and later passes with y as x) see.
                    if not any(y[2] in lst2 for lst2 in unique_iso):
                        y[2]=">"+y[2]
                        unique_iso.append(y)

    l.acquire()
    try:
        for x in unique_iso:
            mat_mirnas.append(x[2])
            mat_mirnas.append(x[9])
    finally:
        # Always hand the lock back, even if appending fails, so that other
        # workers sharing it are not blocked forever.
        l.release()
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
604
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
605 #########################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
606
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def merging_names(ini_mat, new):
    """Give rows that share a sequence one merged name and de-duplicate them.

    Args:
        ini_mat: list of rows (mutable lists); ``row[0]`` is a name and
            ``row[1]`` a sequence.  Rows are renamed in place and the list is
            sorted in place.
        new: output list; extended with the renamed, sorted rows of
            ``ini_mat`` with identical rows collapsed into one.

    A row is flagged when its sequence or its name was already seen on an
    earlier row.  For every flagged sequence the names of all rows carrying it
    are collected; names sharing the text before the first ``"_"`` are
    represented once, by the shortest of them.  The collected names are joined
    with ``"/"`` (the first one, then the others in reverse order) and every
    row with that sequence receives the joined name.
    """
    seen = set()          # every name and sequence met so far
    names_by_seq = {}     # flagged sequence -> collected names
    for row in ini_mat:
        name, seq = row[0], row[1]
        if seq in seen or name in seen:
            names_by_seq.setdefault(seq, [])
        else:
            seen.add(seq)
            seen.add(name)

    # Collect names first; rows are renamed only afterwards so that the
    # collection always sees the original names.
    for row in ini_mat:
        names = names_by_seq.get(row[1])
        if names is None:
            continue
        name = row[0]
        prefix = name.split("_")[0]
        for pos, known in enumerate(names):
            if known.split("_")[0] == prefix:
                # Same prefix: keep only the shorter spelling (moved to the end).
                if len(name) < len(known):
                    del names[pos]
                    names.append(name)
                break
        else:
            names.append(name)

    merged_by_seq = {
        seq: "/".join(names[:1] + names[:0:-1])
        for seq, names in names_by_seq.items()
    }

    for row in ini_mat:
        if row[1] in merged_by_seq:
            row[0] = merged_by_seq[row[1]]

    ini_mat.sort()
    new.extend([row for row, _ in itertools.groupby(ini_mat)])
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
659
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
660
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
661 ######################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
662
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder):
    """Write one two-column text file per column of ``tem_samp``, followed by
    the matching column of ``non_samp``.

    Args:
        tem_names: file/column names; ``tem_names[k]`` belongs to column
            ``k + 2`` of ``tem_samp``.
        tem_samp: list of rows of strings; ``row[0]`` is written as the id and
            the fields from index 2 onwards are the per-file values.
        non_names: column names of ``non_samp`` (``non_names[j]`` belongs to
            column ``j + 2``).
        non_samp: list of rows of strings with the same layout as ``tem_samp``.
        folder: path prefix prepended as-is to ``<name>.txt`` (so it should
            end with a path separator).

    Each file starts with ``miRNA id<TAB><name>``, then one line per row of
    ``tem_samp``, then one line per row of ``non_samp`` for every entry of
    ``non_names`` equal to the file's name.  Nothing is written when
    ``tem_samp`` is empty, because the number of columns is taken from its
    first row.
    """
    if not tem_samp:
        return

    for col in range(2, len(tem_samp[0])):
        sample_name = tem_names[col - 2]
        # ``with`` closes the file even if a row is malformed.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")

            for row in tem_samp:
                fp.write("%s" % "\t".join([row[0], row[col]]) + "\n")

            for j, other_name in enumerate(non_names):
                if other_name == sample_name:
                    for row in non_samp:
                        fp.write("%s" % "\t".join([row[0], row[j + 2]]) + "\n")
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
678
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
679 ###################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
680
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def download_matures(matures,org_name):
    """Download the mature miRNAs of all species from MirBase and keep the
    ones of the requested organism.

    Args:
        matures: output list; for every kept record the header (prefixed with
            ``">"``) and then the following item are appended, i.e. the list
            is filled in fasta order.
        org_name: text that must occur in a record's header for the record to
            be kept.

    The downloaded text is split on ``<br>`` and every ``&gt;`` is removed;
    the resulting items are then read in pairs (header, sequence).
    NOTE(review): this assumes the server answers with that HTML-escaped
    layout - confirm against the current MirBase download.
    """
    url = 'https://mirbase.org/download/CURRENT/mature.fa'

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')

    file_mirna = [item.replace('&gt;', '') for item in data.split("<br>")]

    # Items come in (header, sequence) pairs; a trailing unpaired item is ignored.
    for i in range(0, len(file_mirna) - 1, 2):
        if org_name in file_mirna[i]:
            matures.append(">" + file_mirna[i])
            matures.append(file_mirna[i + 1])
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
703
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
704 ###################################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
705
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
706
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
707 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
708
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
709 This function keeps all mirna isoforms which are detected on SAM files from the first part of the analysis
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
710 These isoforms will be used as reference sequences with canonical (ref) mirnas for the detection of non-template
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
711 mirnas
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
712
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
713 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
714
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
715
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def non_template_ref(c_samples, t_samples, all_isoforms):
    """Collect the templated isoforms found in the samples as extra references.

    Every row whose name (row[2]) contains "_t_" and is not yet listed in
    `all_isoforms` is added as a fasta-like pair: ">" + row[2] followed by
    row[9].  Control samples are processed before treated samples.

    input : c_samples    - control samples, each one an iterable of rows
            t_samples    - treated samples, same layout
            all_isoforms - list of alternating ">name" / sequence entries,
                           extended in place (entries are expected to be
                           hashable, i.e. strings)
    output: None

    NOTE(review): rows look like split SAM records (field 3 = reference name,
    field 10 = read sequence) - confirm against the caller.
    """

    # Take a snapshot of both groups first, then handle them with one loop
    # instead of two copies of the same code.
    groups = list(c_samples) + list(t_samples)

    # Set mirror of `all_isoforms`: membership tests stay O(1) instead of
    # rescanning the growing list for every row.
    known = set(all_isoforms)

    for sample in groups:
        for row in sample:
            header = ">" + row[2]
            if header not in known and "_t_" in row[2]:
                sequence = row[9]
                all_isoforms.append(header)
                all_isoforms.append(sequence)
                # Mirror exactly what was appended so later lookups match
                # a scan of the list itself.
                known.add(header)
                known.add(sequence)
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
734
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
735 ################################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
736
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
737 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
738
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
739 This function adds to a sample, with zero counts, the miRNAs which were detected in other samples but not in it.
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
740 As a result all samples will have the same length.
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
741
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
742 """
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
743
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
def uncommon_mirnas(sample, mir_names, l, new_d, sample_name, sample_order):
    """Pad a sample with the miRNAs it lacks and publish the result.

    Every entry of `mir_names` whose name is not present in `sample` is
    appended as [name, "0", sequence].  The sample is then sorted by name,
    consecutive identical rows are collapsed, and the outcome is appended to
    the shared containers while the lock is held.

    input : sample       - list of rows, row[0] being the miRNA name
                           (extended and sorted in place)
            mir_names    - iterable of [name, sequence] entries
            l            - lock object offering acquire() / release()
            new_d        - shared list which receives the updated sample
            sample_name  - label of this sample
            sample_order - shared list which receives `sample_name`
    output: None
    """

    # Names already in the sample; kept up to date while padding so that a
    # name repeated in `mir_names` is added only once.
    present = {row[0] for row in sample}

    for entry in mir_names:
        name = entry[0]
        if name not in present:
            # add the name of mirna to the sample with zero counts and its sequence
            sample.append([name, "0", entry[1]])
            present.add(name)

    # sorting and remove duplicates (only adjacent identical rows collapse)
    sample.sort(key=lambda row: row[0])
    deduplicated = [row for row, _ in itertools.groupby(sample)]

    # Return the updated sample.  try/finally guarantees that the lock is
    # released even when one of the appends fails.
    l.acquire()
    try:
        new_d.append(deduplicated)
        sample_order.append(sample_name)
    finally:
        l.release()
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
764
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
765 ###############################################################################################################################################################################################
1bfac419081d Uploaded
glogobyte
parents:
diff changeset
766