| 39 | 1 import itertools | 
|  | 2 import re | 
|  | 3 import urllib.request | 
|  | 4 import gzip | 
|  | 5 import copy | 
|  | 6 from collections import OrderedDict | 
|  | 7 | 
|  | 8 | 
|  | 9 | 
|  | 10 # Read a file and return it as a list | 
|  | 11 def read(path, flag): | 
|  | 12     if flag == 0: | 
|  | 13         with open(path) as fp: | 
|  | 14             file=fp.readlines() | 
|  | 15         fp.close() | 
|  | 16         return file | 
|  | 17 | 
|  | 18     if flag == 1: | 
|  | 19         with open(path) as fp: | 
|  | 20             file = fp.read().splitlines() | 
|  | 21         fp.close() | 
|  | 22         return file | 
|  | 23 | 
|  | 24 # Write a list to a txt file | 
|  | 25 def write(path, list): | 
|  | 26     with open(path,'w') as fp: | 
|  | 27         for x in list: | 
|  | 28             fp.write(str("\t".join(x[1:-1]))) | 
|  | 29     fp.close() | 
|  | 30 | 
|  | 31 | 
|  | 32 #################################################################################################################> | 
|  | 33 | 
|  | 34 # Detect the longest common substring sequence between two mirnas | 
|  | 35 def longestSubstring(str1, str2): | 
|  | 36 | 
|  | 37     from difflib import SequenceMatcher | 
|  | 38     # initialize SequenceMatcher object with | 
|  | 39     # input string | 
|  | 40     seqMatch = SequenceMatcher(None, str1, str2) | 
|  | 41 | 
|  | 42     # find match of longest sub-string | 
|  | 43     # output will be like Match(a=0, b=0, size=5) | 
|  | 44     match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) | 
|  | 45 | 
|  | 46     # print longest substring | 
|  | 47     if (match.size != 0): | 
|  | 48         return str1[match.a: match.a + match.size] | 
|  | 49     else: | 
|  | 50         print('No longest common sub-string found') | 
|  | 51 | 
|  | 52 ################################################################################################################################################################################################################# | 
|  | 53 | 
|  | 54 """ | 
|  | 55 | 
|  | 56 This function concatenates miRNAs which are generated from different chromosomes | 
|  | 57 and eliminates the duplications of miRNAs on every sample | 
|  | 58 | 
|  | 59 input:  detected miRNAs | 
|  | 60 output: collpased miRNAs without duplicates | 
|  | 61 | 
|  | 62 """ | 
|  | 63 | 
|  | 64 | 
|  | 65 def remove_duplicates(mirnas): | 
|  | 66 | 
|  | 67  # Detection of canonical mirRNAs whicha are generated from different chromosomes | 
|  | 68  dupes=[[x[9],x[0],x[2]] for x in mirnas] | 
|  | 69 | 
|  | 70  for x in mirnas: | 
|  | 71      for y in dupes: | 
|  | 72          if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]: | 
|  | 73             y.append(x[2]) | 
|  | 74 | 
|  | 75  # Detection of different chromosomes for every miRNA | 
|  | 76  chr_order = [] | 
|  | 77  for x in dupes: | 
|  | 78      temp = [] | 
|  | 79      for i in range(2,len(x)): | 
|  | 80          if x[i].split("chr")[1].split("(")[0].isdigit(): | 
|  | 81             temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) | 
|  | 82          else: | 
|  | 83             temp.append(x[i].split("chr")[1][0:4]) | 
|  | 84 | 
|  | 85      for z in temp: | 
|  | 86          if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: | 
|  | 87              temp = [str(j) for j in temp] | 
|  | 88      temp = list(set(temp)) | 
|  | 89      temp.sort() | 
|  | 90      chr_order.append(temp) | 
|  | 91 | 
|  | 92  # Collapsing the miRNAs with the same sequence from different chromosomes | 
|  | 93  collapsed_dupes=[] | 
|  | 94  for i in range(len(dupes)): | 
|  | 95      collapsed_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) | 
|  | 96      for x in chr_order[i]: | 
|  | 97          chr_check = re.match("[-+]?\d+$", str(x))	  # check if chromosome is 'X' or 'Y' | 
|  | 98          if chr_check is not None: | 
|  | 99             if int(x)<0:                 # Check the strand (+) or (-) | 
|  | 100                collapsed_dupes[i][1]= collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" | 
|  | 101             else: | 
|  | 102                collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" | 
|  | 103          else: | 
|  | 104             collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(x) | 
|  | 105 | 
|  | 106  # Remove duplicates from collapsed_dupes | 
|  | 107  collapsed_dupes.sort() | 
|  | 108  collapsed_dupes = list(collapsed_dupes for collapsed_dupes,_ in itertools.groupby(collapsed_dupes)) | 
|  | 109 | 
|  | 110  for i in range(len(mirnas)): | 
|  | 111      for x in collapsed_dupes: | 
|  | 112 | 
|  | 113          # Naming of template isomirs (adding positions in the names) | 
|  | 114          if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) >3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: | 
|  | 115             gg=str("_t_"+mirnas[i][2].split("_")[-2]+"_"+mirnas[i][2].split("_")[-1]) | 
|  | 116             mirnas[i][2] = x[1]+gg | 
|  | 117             break | 
|  | 118 | 
|  | 119          # Naming of canonical miRNAs (collpsed names) | 
|  | 120          if mirnas[i][9]==x[0] and mirnas[i][0]== x[2] and len(mirnas[i][2].split("_"))==3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: | 
|  | 121             mirnas[i][2] = x[1] | 
|  | 122             break | 
|  | 123 | 
|  | 124  # Remove duplicates | 
|  | 125  mirnas.sort() | 
|  | 126  mirnas=list(mirnas for mirnas,_ in itertools.groupby(mirnas)) | 
|  | 127 | 
|  | 128  return mirnas | 
|  | 129 | 
|  | 130 ############################################################################################################################################################################################################# | 
|  | 131 | 
|  | 132 """ | 
|  | 133 | 
|  | 134 This function indentifies and classifies the miRNAs which are detected from the alignment tool. | 
|  | 135 | 
|  | 136 """ | 
|  | 137 | 
|  | 138 def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts): | 
|  | 139 | 
|  | 140     # read the sam file | 
|  | 141     ini_sam=read(path,0) | 
|  | 142     main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]     # remove introduction | 
|  | 143     unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]   # keeps only the functional miRNAs | 
|  | 144     filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam]                             # keeps only the necessary info of miRNAs from sam files (name, sequence, counts, etc) | 
|  | 145 | 
|  | 146     sorted_uni_arms = [] | 
|  | 147 | 
|  | 148     for i in range(0,len(mature_mirnas,),2): | 
|  | 149         tmp_count_reads = 0   # calculate the total number of reads | 
|  | 150         tmp_count_seq = 0     # calculate the total number of sequences | 
|  | 151         for j in range(len(unique_seq)): | 
|  | 152 | 
|  | 153             if "{" in unique_seq[j][2].split("_")[0]:           # checks if a miRNA is generated from two different locis on the same chromosome | 
|  | 154                 mirna=unique_seq[j][2].split("_")[0][:-4] | 
|  | 155             else: | 
|  | 156                 mirna=unique_seq[j][2].split("_")[0] | 
|  | 157 | 
|  | 158             # Detection of differences between the canonical miRNA and the detected miRNA | 
|  | 159             if mature_mirnas[i].split(" ")[0][1:] == mirna: | 
|  | 160 | 
|  | 161                 temp_mature = mature_mirnas[i+1].strip().replace("U", "T") | 
|  | 162                 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | 
|  | 163 | 
|  | 164                 mat_diff = temp_mature.split(off_part) | 
|  | 165                 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | 
|  | 166 | 
|  | 167                 unique_diff = unique_seq[j][9].split(off_part) | 
|  | 168                 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | 
|  | 169 | 
|  | 170                 # Handling of some special mirnas like (hsa-miR-8485) | 
|  | 171                 if mat_diff[1]!=0 and unique_diff[1]!=0: | 
|  | 172                     unique_seq[j]=1 | 
|  | 173                     pre_pos = 0 | 
|  | 174                     post_pos = 0 | 
|  | 175 | 
|  | 176                 elif mat_diff[0]!=0 and unique_diff[0]!=0: | 
|  | 177                     unique_seq[j]=1 | 
|  | 178                     pre_pos = 0 | 
|  | 179                     post_pos = 0 | 
|  | 180 | 
|  | 181                 else: | 
|  | 182                    # Keep the findings | 
|  | 183                    pre_pos = mat_diff[0]-unique_diff[0] | 
|  | 184                    post_pos = unique_diff[1]-mat_diff[1] | 
|  | 185                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | 
|  | 186                    tmp_count_seq = tmp_count_seq+1 | 
|  | 187 | 
|  | 188                 # Store the detected miRNAs with new names according to the findings | 
|  | 189                 if pre_pos != 0 or post_pos != 0: | 
|  | 190                     if pre_pos == 0: | 
|  | 191                         unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+ "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) | 
|  | 192                     elif post_pos == 0: | 
|  | 193                         unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) | 
|  | 194                     else: | 
|  | 195                         unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) | 
|  | 196 | 
|  | 197         # Remove the values "1" from the handling of special mirnas (hsa-miR-8485) | 
|  | 198         for x in range(unique_seq.count(1)): | 
|  | 199             unique_seq.remove(1) | 
|  | 200 | 
|  | 201         # metrics for the production of database | 
|  | 202         if tmp_count_reads != 0 and tmp_count_seq != 0: | 
|  | 203            sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) | 
|  | 204 | 
|  | 205     # Sorting of the metrics for database | 
|  | 206     sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | 
|  | 207 | 
|  | 208     # Collapsing of miRNAs and removing of duplicates | 
|  | 209     collapsed_mirnas = remove_duplicates(unique_seq) | 
|  | 210 | 
|  | 211     # Correction of metrics due to the collapsing and removing of duplicates for the production of Database | 
|  | 212     for y in sorted_uni_arms: | 
|  | 213        counts=0 | 
|  | 214        seqs=0 | 
|  | 215        for x in collapsed_mirnas: | 
|  | 216            if y[0] in x[2].split("_")[0]: | 
|  | 217               counts+=int(x[0].split("-")[1]) | 
|  | 218               seqs+=1 | 
|  | 219 | 
|  | 220        y[1]=seqs | 
|  | 221        y[2]=counts | 
|  | 222 | 
|  | 223 | 
|  | 224     # Output variables | 
|  | 225     temp_mirna_names=[] | 
|  | 226 | 
|  | 227     l.acquire() | 
|  | 228 | 
|  | 229     if case == "c" or case == "t": | 
|  | 230        temp_mirna_names.extend(z[2] for z in collapsed_mirnas) | 
|  | 231        names_n_seqs.extend([[y[2],y[9]] for y in collapsed_mirnas]) | 
|  | 232        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas]) | 
|  | 233        mirna_names.extend(temp_mirna_names) | 
|  | 234        unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])     # Keeps the unmap unique sequences for the production of a graph | 
|  | 235        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])    # Keeps the unmap counts of sequences for the production of a graph | 
|  | 236        file_order.append(file)    #Keeps the names of SAM files with the order of reading by the fuction (avoid problems due to multiprocesssing) | 
|  | 237        samples.append(collapsed_mirnas)         # return the processed detected miRNAs | 
|  | 238        data.append([case,file,collapsed_mirnas,sorted_uni_arms]) | 
|  | 239        ini_sample.append(filter_sam)    # returns the filtered sam file | 
|  | 240 | 
|  | 241     l.release() | 
|  | 242 | 
|  | 243 | 
|  | 244 ###################################################################################################################################### | 
|  | 245 | 
|  | 246 | 
|  | 247 """ | 
|  | 248 | 
|  | 249 Read a sam file from Bowtie and do the followings: | 
|  | 250 | 
|  | 251 1) Remove reverse stranded mapped reads | 
|  | 252 2) Remove unmapped reads | 
|  | 253 3) Remove all sequences with reads less than 11 reads | 
|  | 254 4) Sort the arms with the most sequences in decreading rate | 
|  | 255 5) Sort the sequences of every arm with the most reads in decreasing rate | 
|  | 256 6) Calculate total number of sequences of every arm | 
|  | 257 7) Calculate total number of reads of sequences of every arm. | 
|  | 258 8) Store all the informations in a txt file | 
|  | 259 | 
|  | 260 """ | 
|  | 261 | 
|  | 262 def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs): | 
|  | 263 | 
|  | 264     # read the sam file | 
|  | 265     ini_sam=read(path,0) | 
|  | 266     main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | 
|  | 267     unique_seq=[] | 
|  | 268     unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] | 
|  | 269 | 
|  | 270     uni_seq=[] | 
|  | 271 | 
|  | 272     # Calculate the shifted positions for every non template mirna and add them to the name of it | 
|  | 273     sorted_uni_arms = [] | 
|  | 274     for i in range(1,len(mature_mirnas),2): | 
|  | 275         tmp_count_reads = 0   # calculate the total number of reads | 
|  | 276         tmp_count_seq = 0     # calculate the total number of sequences | 
|  | 277 | 
|  | 278         for j in range(len(unique_seq)): | 
|  | 279 | 
|  | 280             temp_mature = mature_mirnas[i].strip().replace("U", "T") | 
|  | 281 | 
|  | 282             # Detection of differences between the canonical miRNA and the detected non template miRNA | 
|  | 283             if temp_mature in unique_seq[j][9]: | 
|  | 284 | 
|  | 285                 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | 
|  | 286 | 
|  | 287                 mat_diff = temp_mature.split(off_part) | 
|  | 288                 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | 
|  | 289 | 
|  | 290                 unique_diff = unique_seq[j][9].split(off_part) | 
|  | 291                 if len(unique_diff)<=2: | 
|  | 292                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | 
|  | 293 | 
|  | 294                    pre_pos = mat_diff[0]-unique_diff[0] | 
|  | 295                    post_pos = unique_diff[1]-mat_diff[1] | 
|  | 296 | 
|  | 297                    lengthofmir = len(off_part) + post_pos | 
|  | 298                    if pre_pos == 0 and post_pos<4: | 
|  | 299                       tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | 
|  | 300                       tmp_count_seq = tmp_count_seq + 1 | 
|  | 301 | 
|  | 302                       t_name=unique_seq[j].copy() | 
|  | 303                       t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) | 
|  | 304                       uni_seq.append(t_name) | 
|  | 305         # metrics for the production of database | 
|  | 306         if tmp_count_reads != 0 and tmp_count_seq != 0: | 
|  | 307             sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) | 
|  | 308 | 
|  | 309     sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | 
|  | 310     unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) | 
|  | 311 | 
|  | 312     # Output variables | 
|  | 313     l.acquire() | 
|  | 314     if case=="c" or case=="t": | 
|  | 315        names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"]) | 
|  | 316        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | 
|  | 317        file_order.append(file) | 
|  | 318        data.append([case,file,unique_seq,sorted_uni_arms]) | 
|  | 319     l.release() | 
|  | 320 | 
|  | 321 ################################################################################################################################################################################################################# | 
|  | 322 | 
|  | 323 def black_white(mirna_names_1,mirna_names_2,group,manager): | 
|  | 324 | 
|  | 325     add_names = [x for x in mirna_names_1 if x not in mirna_names_2] | 
|  | 326     add_names.sort() | 
|  | 327     add_names = list(add_names for add_names,_ in itertools.groupby(add_names)) | 
|  | 328 | 
|  | 329     group.sort() | 
|  | 330     group = list(group for group,_ in itertools.groupby(group)) | 
|  | 331 | 
|  | 332     zeros=["0"]*(len(group[0])-2) | 
|  | 333     [add_names[i].extend(zeros) for i,_ in enumerate(add_names)] | 
|  | 334     group=group+add_names | 
|  | 335 | 
|  | 336     manager.extend(group) | 
|  | 337 | 
|  | 338 ################################################################################################################################################################################################################################ | 
|  | 339 | 
|  | 340 def merging_dupes(group,f_dupes): | 
|  | 341 | 
|  | 342     dupes=[] | 
|  | 343     final_mat =[] | 
|  | 344 | 
|  | 345     for num,_ in enumerate(group): | 
|  | 346 | 
|  | 347         if group[num][1] not in final_mat and group[num][0] not in final_mat: | 
|  | 348            final_mat.append(group[num][1]) | 
|  | 349            final_mat.append(group[num][0]) | 
|  | 350         else: | 
|  | 351            dupes.append(group[num][1]) | 
|  | 352 | 
|  | 353 | 
|  | 354     dupes=list(set(dupes)) | 
|  | 355 | 
|  | 356     dupes=[[x] for x in dupes] | 
|  | 357 | 
|  | 358     for x in group: | 
|  | 359         for y in dupes: | 
|  | 360             if x[1]==y[0]: | 
|  | 361                fl=0 | 
|  | 362                if len(y)==1: | 
|  | 363                   y.append(x[0]) | 
|  | 364                else: | 
|  | 365                   for i in range(1,len(y)): | 
|  | 366                       if y[i].split("_")[0]==x[0].split("_")[0]: | 
|  | 367                          fl=1 | 
|  | 368                          if len(x[0])<len(y[i]): | 
|  | 369                             del y[i] | 
|  | 370                             y.append(x[0]) | 
|  | 371                             break | 
|  | 372 | 
|  | 373                   if fl==0: | 
|  | 374                      y.append((x[0])) | 
|  | 375 | 
|  | 376     for y in dupes: | 
|  | 377         if len(y)>2: | 
|  | 378            for i in range(len(y)-1,1,-1): | 
|  | 379                y[1]=y[1]+"/"+y[i] | 
|  | 380                del y[i] | 
|  | 381 | 
|  | 382     f_dupes.extend(dupes) | 
|  | 383 | 
|  | 384 ########################################################################################################################################################################################################################################## | 
|  | 385 | 
|  | 386 def apply_merging_dupes(group,dupes,managger): | 
|  | 387 | 
|  | 388     for x in group: | 
|  | 389      for y in dupes: | 
|  | 390          if x[1]==y[0]: | 
|  | 391             x[0]=y[1] | 
|  | 392 | 
|  | 393     group.sort() | 
|  | 394     group=list(group for group,_ in itertools.groupby(group)) | 
|  | 395     managger.extend(group) | 
|  | 396 | 
|  | 397 ############################################################################################################################################################################################################################### | 
|  | 398 | 
|  | 399 | 
|  | 400 def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts): | 
|  | 401 | 
|  | 402     t_group_new=[] | 
|  | 403     c_group_new=[] | 
|  | 404 | 
|  | 405     percent=int(per)/100 | 
|  | 406     c_col_filter=round(percent*(len(c_group[1])-2)) | 
|  | 407     t_col_filter=round(percent*(len(t_group[1])-2)) | 
|  | 408 | 
|  | 409     for i, _ in enumerate(c_group): | 
|  | 410         c_cols=0 | 
|  | 411         t_cols=0 | 
|  | 412 | 
|  | 413         c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)]) | 
|  | 414         t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)]) | 
|  | 415 | 
|  | 416         if c_cols>=c_col_filter or t_cols>=t_col_filter: | 
|  | 417            t_group_new.append(t_group[i]) | 
|  | 418            c_group_new.append(c_group[i]) | 
|  | 419 | 
|  | 420     fil_c_group.extend(c_group_new) | 
|  | 421     fil_t_group.extend(t_group_new) | 
|  | 422 | 
|  | 423 ################################################################################################################################################################################################################## | 
|  | 424 | 
|  | 425 | 
|  | 426 def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per): | 
|  | 427 | 
|  | 428  if flag == 1 and int(per)!=-1: | 
|  | 429     fp = open('Counts/Filtered '+group_name2 +' Templated Counts', 'w') | 
|  | 430     fp.write("Name\t") | 
|  | 431     fp.write("Sequence") | 
|  | 432     for y in tre_file_order: | 
|  | 433        fp.write("\t"+y) | 
|  | 434 | 
|  | 435     for x in fil_tre: | 
|  | 436         fp.write("\n%s" % "\t".join(x)) | 
|  | 437     fp.close() | 
|  | 438 | 
|  | 439     fp = open('Counts/Filtered '+group_name1+' Templated Counts', 'w') | 
|  | 440     fp.write("Name\t") | 
|  | 441     fp.write("Sequence") | 
|  | 442     for y in con_file_order: | 
|  | 443        fp.write("\t"+y) | 
|  | 444 | 
|  | 445     for x in fil_con: | 
|  | 446         fp.write("\n%s" % "\t".join(x)) | 
|  | 447     fp.close() | 
|  | 448 | 
|  | 449 | 
|  | 450  if flag == 2 and int(per)!=-1: | 
|  | 451     fp = open('Counts/Filtered '+group_name2+' Non-Templated Counts', 'w') | 
|  | 452     fp.write("Name\t") | 
|  | 453     fp.write("Sequence") | 
|  | 454     for y in tre_file_order: | 
|  | 455        fp.write("\t"+y) | 
|  | 456 | 
|  | 457 | 
|  | 458     for x in fil_tre: | 
|  | 459         fp.write("\n%s" % "\t".join(x)) | 
|  | 460     fp.close() | 
|  | 461 | 
|  | 462     fp = open('Counts/Filtered '+group_name1+' Non-Templated Counts', 'w') | 
|  | 463     fp.write("Name\t") | 
|  | 464     fp.write("Sequence") | 
|  | 465     for y in con_file_order: | 
|  | 466        fp.write("\t"+y) | 
|  | 467 | 
|  | 468     for x in fil_con: | 
|  | 469         fp.write("\n%s" % "\t".join(x)) | 
|  | 470     fp.close() | 
|  | 471 | 
|  | 472 | 
|  | 473  if flag == 1: | 
|  | 474     fp = open('Counts/Raw '+group_name2+' Templated Counts', 'w') | 
|  | 475     fp.write("Name\t") | 
|  | 476     fp.write("Sequence") | 
|  | 477     for y in tre_file_order: | 
|  | 478        fp.write("\t"+y) | 
|  | 479 | 
|  | 480     for x in raw_tre: | 
|  | 481         fp.write("\n%s" % "\t".join(x)) | 
|  | 482     fp.close() | 
|  | 483 | 
|  | 484     fp = open('Counts/Raw '+group_name1+' Templated Counts', 'w') | 
|  | 485     fp.write("Name\t") | 
|  | 486     fp.write("Sequence") | 
|  | 487     for y in con_file_order: | 
|  | 488        fp.write("\t"+y) | 
|  | 489 | 
|  | 490     for x in raw_con: | 
|  | 491         fp.write("\n%s" % "\t".join(x)) | 
|  | 492     fp.close() | 
|  | 493 | 
|  | 494 | 
|  | 495  if flag == 2: | 
|  | 496     fp = open('Counts/Raw '+group_name2+' Non-Templated Counts', 'w') | 
|  | 497     fp.write("Name\t") | 
|  | 498     fp.write("Sequence") | 
|  | 499     for y in tre_file_order: | 
|  | 500        fp.write("\t"+y) | 
|  | 501 | 
|  | 502 | 
|  | 503     for x in raw_tre: | 
|  | 504         fp.write("\n%s" % "\t".join(x)) | 
|  | 505     fp.close() | 
|  | 506 | 
|  | 507     fp = open('Counts/Raw '+group_name1+' Non-Templated Counts', 'w') | 
|  | 508     fp.write("Name\t") | 
|  | 509     fp.write("Sequence") | 
|  | 510     for y in con_file_order: | 
|  | 511        fp.write("\t"+y) | 
|  | 512 | 
|  | 513     for x in raw_con: | 
|  | 514         fp.write("\n%s" % "\t".join(x)) | 
|  | 515     fp.close() | 
|  | 516 | 
|  | 517 | 
|  | 518 ######################################################################################################################################### | 
|  | 519 | 
|  | 520 def temp_counts_to_diff(names,samp,folder): | 
|  | 521 | 
|  | 522     for i in range(2,len(samp[0])): | 
|  | 523 | 
|  | 524        fp = open(folder+names[i-2]+'.txt','w') | 
|  | 525        fp.write("miRNA id"+"\t"+names[i-2]+"\n") | 
|  | 526 | 
|  | 527        for x in samp: | 
|  | 528            fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | 
|  | 529        fp.close() | 
|  | 530 | 
|  | 531 ################################################################################################################## | 
|  | 532 | 
|  | 533 def DB_write(con,name,unique_seq,sorted_uni_arms,f): | 
|  | 534 | 
|  | 535  if f==1: | 
|  | 536     # Write a txt file with all the information | 
|  | 537     if con=="c": | 
|  | 538        fp = open('split1/'+name, 'w') | 
|  | 539 | 
|  | 540        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 541     if con=="t": | 
|  | 542        fp = open('split2/'+name, 'w') | 
|  | 543        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 544 | 
|  | 545 | 
|  | 546     for i in range(len(sorted_uni_arms)): | 
|  | 547         temp = [] | 
|  | 548         for j in range(len(unique_seq)): | 
|  | 549 | 
|  | 550             if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: | 
|  | 551 | 
|  | 552                 temp.append(unique_seq[j]) | 
|  | 553 | 
|  | 554         temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | 
|  | 555         fp.write("*********************************************************************************************************\n") | 
|  | 556         fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | 
|  | 557         fp.write("*********************************************************************************************************\n\n") | 
|  | 558         [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | 
|  | 559         fp.write("\n" + "\n") | 
|  | 560     fp.close() | 
|  | 561 | 
|  | 562  if f==2: | 
|  | 563 | 
|  | 564     if con=="c": | 
|  | 565        fp = open('split3/'+name, 'w') | 
|  | 566        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 567     if con=="t": | 
|  | 568        fp = open('split4/'+name, 'w') | 
|  | 569        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 570 | 
|  | 571 | 
|  | 572     for i in range(len(sorted_uni_arms)): | 
|  | 573         temp = [] | 
|  | 574         for j in range(len(unique_seq)): | 
|  | 575                if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]: | 
|  | 576                   temp.append(unique_seq[j]) | 
|  | 577         if temp!=[]: | 
|  | 578            temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | 
|  | 579            fp.write("*********************************************************************************************************\n") | 
|  | 580            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | 
|  | 581            fp.write("*********************************************************************************************************\n\n") | 
|  | 582            [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | 
|  | 583            fp.write("\n" + "\n") | 
|  | 584     fp.close() | 
|  | 585 | 
|  | 586 | 
|  | 587 ########################################################################################################################## | 
|  | 588 | 
|  | 589 def new_mat_seq(pre_unique_seq,mat_mirnas,l): | 
|  | 590 | 
|  | 591     unique_iso = [] | 
|  | 592     for x in pre_unique_seq: | 
|  | 593        if len(x[2].split("_"))==3: | 
|  | 594           for y in pre_unique_seq: | 
|  | 595               if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): | 
|  | 596                  if any(y[2] in lst2 for lst2 in unique_iso)==False: | 
|  | 597                     y[2]=">"+y[2] | 
|  | 598                     unique_iso.append(y) | 
|  | 599     l.acquire() | 
|  | 600     for x in unique_iso: | 
|  | 601         mat_mirnas.append(x[2]) | 
|  | 602         mat_mirnas.append(x[9]) | 
|  | 603     l.release() | 
|  | 604 | 
|  | 605 ######################################################################################################################### | 
|  | 606 | 
|  | 607 def merging_names(ini_mat,new): | 
|  | 608 | 
|  | 609     dupes=[] | 
|  | 610     final_mat =[] | 
|  | 611 | 
|  | 612     for num in range(len(ini_mat)): | 
|  | 613 | 
|  | 614         if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat: | 
|  | 615            final_mat.append(ini_mat[num][1]) | 
|  | 616            final_mat.append(ini_mat[num][0]) | 
|  | 617         else: | 
|  | 618            dupes.append(ini_mat[num][1]) | 
|  | 619 | 
|  | 620     dupes=list(set(dupes)) | 
|  | 621 | 
|  | 622     for i in range(len(dupes)): | 
|  | 623         dupes[i]=[dupes[i]] | 
|  | 624 | 
|  | 625     for x in ini_mat: | 
|  | 626         for y in dupes: | 
|  | 627             if x[1]==y[0]: | 
|  | 628                fl=0 | 
|  | 629                if len(y)==1: | 
|  | 630                   y.append(x[0]) | 
|  | 631                else: | 
|  | 632                   for i in range(1,len(y)): | 
|  | 633                       if y[i].split("_")[0]==x[0].split("_")[0]: | 
|  | 634                          fl=1 | 
|  | 635                          if len(x[0])<len(y[i]): | 
|  | 636                             del y[i] | 
|  | 637                             y.append(x[0]) | 
|  | 638                             break | 
|  | 639 | 
|  | 640                   if fl==0: | 
|  | 641                      y.append((x[0])) | 
|  | 642 | 
|  | 643     for y in dupes: | 
|  | 644         if len(y)>2: | 
|  | 645            for i in range(len(y)-1,1,-1): | 
|  | 646                y[1]=y[1]+"/"+y[i] | 
|  | 647                del y[i] | 
|  | 648 | 
|  | 649 | 
|  | 650     for x in ini_mat: | 
|  | 651         for y in dupes: | 
|  | 652             if x[1]==y[0]: | 
|  | 653                x[0]=y[1] | 
|  | 654 | 
|  | 655     ini_mat.sort() | 
|  | 656     ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat)) | 
|  | 657 | 
|  | 658     new.extend(ini_mat) | 
|  | 659 | 
|  | 660 | 
|  | 661 ###################################################################################################################################################### | 
|  | 662 | 
|  | 663 def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder): | 
|  | 664 | 
|  | 665     for i in range(2,len(tem_samp[0])): | 
|  | 666 | 
|  | 667        fp = open(folder+tem_names[i-2]+'.txt','w') | 
|  | 668        fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") | 
|  | 669 | 
|  | 670        for x in tem_samp: | 
|  | 671            fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | 
|  | 672 | 
|  | 673        for j in range(len(non_names)): | 
|  | 674            if non_names[j]==tem_names[i-2]: | 
|  | 675               for x in non_samp: | 
|  | 676                   fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") | 
|  | 677        fp.close() | 
|  | 678 | 
|  | 679 ################################################################################################################################################################################################################### | 
|  | 680 | 
|  | 681 """ | 
|  | 682 | 
|  | 683 This function downloads all the miRNAs of all the species from MirBase | 
|  | 684 and filters them by the requested organism | 
|  | 685 | 
|  | 686 input : Organism | 
|  | 687 output: A list with the miRNA sequences in fasta format | 
|  | 688 | 
|  | 689 """ | 
|  | 690 | 
|  | 691 def download_matures(matures,org_name): | 
|  | 692 | 
|  | 693     url = 'https://mirbase.org/download/CURRENT/mature.fa' | 
|  | 694     data = urllib.request.urlopen(url).read().decode('utf-8') | 
|  | 695     file_mirna = data.split("<br>") | 
|  | 696     file_mirna = list(map(lambda x: x.replace('>', ''), file_mirna)) | 
|  | 697 | 
|  | 698     for i in range(0,len(file_mirna)-1,2): | 
|  | 699 | 
|  | 700         if org_name in file_mirna[i]: | 
|  | 701            matures.append(">"+file_mirna[i]) | 
|  | 702            matures.append(file_mirna[i+1]) | 
|  | 703 | 
|  | 704 ################################################################################################################################################################################################################### | 
|  | 705 | 
|  | 706 | 
|  | 707 """ | 
|  | 708 | 
|  | 709 This function keeps all mirna isoforms which are detected on SAM files from the first part of the analysis | 
|  | 710 These isoforms will be used as refence sequences with canonical (ref) mirnas for the detection of non-template | 
|  | 711 mirnas | 
|  | 712 | 
|  | 713 """ | 
|  | 714 | 
|  | 715 | 
|  | 716 def non_template_ref(c_samples,t_samples,all_isoforms): | 
|  | 717 | 
|  | 718   pre_uni_seq_con = list(c_samples) | 
|  | 719   pre_uni_seq_tre = list(t_samples) | 
|  | 720 | 
|  | 721   for x in pre_uni_seq_con: | 
|  | 722       for y in x: | 
|  | 723           #if ">"+y[2] not in all_isoforms and ")_" in y[2] : | 
|  | 724            if ">"+y[2] not in all_isoforms and "_t_" in y[2] : | 
|  | 725              all_isoforms.append(">"+y[2]) | 
|  | 726              all_isoforms.append(y[9]) | 
|  | 727 | 
|  | 728   for x in pre_uni_seq_tre: | 
|  | 729       for y in x: | 
|  | 730           #if ">"+y[2] not in all_isoforms and ")_" in y[2]: | 
|  | 731            if ">"+y[2] not in all_isoforms and "_t_" in y[2] : | 
|  | 732              all_isoforms.append(">"+y[2]) | 
|  | 733              all_isoforms.append(y[9]) | 
|  | 734 | 
|  | 735 ################################################################################################################################################################################################ | 
|  | 736 | 
|  | 737 """ | 
|  | 738 | 
|  | 739 This function adds the uncommon detected miRNAs among samples. | 
|  | 740 As a result all samples will have the same length. | 
|  | 741 | 
|  | 742 """ | 
|  | 743 | 
|  | 744 def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order): | 
|  | 745 | 
|  | 746     for y in mir_names: | 
|  | 747         flag=0 | 
|  | 748         for x in sample: | 
|  | 749             if y[0]==x[0]: # check if miRNA exists in the sample | 
|  | 750                flag=1 | 
|  | 751                break | 
|  | 752         if flag==0: | 
|  | 753            sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence | 
|  | 754 | 
|  | 755     # sorting and remove duplicates | 
|  | 756     sample.sort(key=lambda x: x[0]) | 
|  | 757     sample=list(sample for sample,_ in itertools.groupby(sample)) | 
|  | 758 | 
|  | 759     # Return the updated sample | 
|  | 760     l.acquire() | 
|  | 761     new_d.append(sample) | 
|  | 762     sample_order.append(sample_name) | 
|  | 763     l.release() | 
|  | 764 | 
|  | 765 ############################################################################################################################################################################################### | 
|  | 766 |