view mirbase_functions.py @ 7:0e71fc1eaea2 draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:05:56 +0000
parents 47232a73a46b
children 77ba8dde6fb7
line wrap: on
line source

import itertools
import re
import urllib.request
import gzip
import copy
from collections import OrderedDict



# Read a file and return it as a list
def read(path, flag):
    if flag == 0:
        with open(path) as fp:
            file=fp.readlines()
        fp.close()
        return file

    if flag == 1:
        with open(path) as fp:
            file = fp.read().splitlines()
        fp.close()
        return file

# Write a list to a txt file
def write(path, list):
    with open(path,'w') as fp:
        for x in list:
            fp.write(str("\t".join(x[1:-1])))
    fp.close()


#################################################################################################################>

# Detect the longest common substring sequence between two mirnas
def longestSubstring(str1, str2):

    from difflib import SequenceMatcher
    # initialize SequenceMatcher object with
    # input string
    seqMatch = SequenceMatcher(None, str1, str2)

    # find match of longest sub-string
    # output will be like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    # print longest substring
    if (match.size != 0):
        return str1[match.a: match.a + match.size]
    else:
        print('No longest common sub-string found')

#################################################################################################################################################################################################################

"""

This function concatenates miRNAs which are generated from different chromosomes
and eliminates the duplications of miRNAs on every sample

input:  detected miRNAs
output: collpased miRNAs without duplicates

"""


def remove_duplicates(mirnas):

 # Detection of canonical mirRNAs whicha are generated from different chromosomes
 dupes=[[x[9],x[0],x[2]] for x in mirnas]

 for x in mirnas:
     for y in dupes:
         if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]:
            y.append(x[2])

 # Detection of different chromosomes for every miRNA
 chr_order = []
 for x in dupes:
     temp = []
     for i in range(2,len(x)):
         if x[i].split("chr")[1].split("(")[0].isdigit():
            temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
         else:
            temp.append(x[i].split("chr")[1][0:4])

     for z in temp:
         if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
             temp = [str(j) for j in temp]
     temp = list(set(temp))
     temp.sort()
     chr_order.append(temp)

 # Collapsing the miRNAs with the same sequence from different chromosomes
 collapsed_dupes=[]
 for i in range(len(dupes)):
     collapsed_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
     for x in chr_order[i]:
         chr_check = re.match("[-+]?\d+$", str(x))	  # check if chromosome is 'X' or 'Y'
         if chr_check is not None:
            if int(x)<0:                 # Check the strand (+) or (-)
               collapsed_dupes[i][1]= collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
            else:
               collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
         else:
            collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(x)

 # Remove duplicates from collapsed_dupes
 collapsed_dupes.sort()
 collapsed_dupes = list(collapsed_dupes for collapsed_dupes,_ in itertools.groupby(collapsed_dupes))

 for i in range(len(mirnas)):
     for x in collapsed_dupes:

         # Naming of template isomirs (adding positions in the names)
         if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) >3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]:
            gg=str("_t_"+mirnas[i][2].split("_")[-2]+"_"+mirnas[i][2].split("_")[-1])
            mirnas[i][2] = x[1]+gg
            break

         # Naming of canonical miRNAs (collpsed names)
         if mirnas[i][9]==x[0] and mirnas[i][0]== x[2] and len(mirnas[i][2].split("_"))==3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]:
            mirnas[i][2] = x[1]
            break

 # Remove duplicates
 mirnas.sort()
 mirnas=list(mirnas for mirnas,_ in itertools.groupby(mirnas))

 return mirnas

#############################################################################################################################################################################################################

"""

This function indentifies and classifies the miRNAs which are detected from the alignment tool.

"""

def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):

    # read the sam file
    ini_sam=read(path,0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]     # remove introduction
    unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]   # keeps only the functional miRNAs
    filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam]                             # keeps only the necessary info of miRNAs from sam files (name, sequence, counts, etc)

    sorted_uni_arms = []

    for i in range(0,len(mature_mirnas,),2):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences
        for j in range(len(unique_seq)):

            if "{" in unique_seq[j][2].split("_")[0]:           # checks if a miRNA is generated from two different locis on the same chromosome
                mirna=unique_seq[j][2].split("_")[0][:-4]
            else:
                mirna=unique_seq[j][2].split("_")[0]

            # Detection of differences between the canonical miRNA and the detected miRNA
            if mature_mirnas[i].split(" ")[0][1:] == mirna:

                temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Handling of some special mirnas like (hsa-miR-8485)
                if mat_diff[1]!=0 and unique_diff[1]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0]!=0 and unique_diff[0]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                else:
                   # Keep the findings
                   pre_pos = mat_diff[0]-unique_diff[0]
                   post_pos = unique_diff[1]-mat_diff[1]
                   tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                   tmp_count_seq = tmp_count_seq+1

                # Store the detected miRNAs with new names according to the findings
                if pre_pos != 0 or post_pos != 0:
                    if pre_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+ "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                    elif post_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                    else:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)

        # Remove the values "1" from the handling of special mirnas (hsa-miR-8485)
        for x in range(unique_seq.count(1)):
            unique_seq.remove(1)

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
           sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    # Sorting of the metrics for database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Collapsing of miRNAs and removing of duplicates
    collapsed_mirnas = remove_duplicates(unique_seq)

    # Correction of metrics due to the collapsing and removing of duplicates for the production of Database
    for y in sorted_uni_arms:
       counts=0
       seqs=0
       for x in collapsed_mirnas:
           if y[0]==x[2].split("_")[0]:
              counts+=int(x[0].split("-")[1])
              seqs+=1

       y[1]=seqs
       y[2]=counts


    # Output variables
    temp_mirna_names=[]

    l.acquire()

    if case == "c" or case == "t":
       temp_mirna_names.extend(z[2] for z in collapsed_mirnas)
       names_n_seqs.extend([[y[2],y[9]] for y in collapsed_mirnas])
       deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas])
       mirna_names.extend(temp_mirna_names)
       unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])     # Keeps the unmap unique sequences for the production of a graph
       unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])    # Keeps the unmap counts of sequences for the production of a graph
       file_order.append(file)    #Keeps the names of SAM files with the order of reading by the fuction (avoid problems due to multiprocesssing)
       samples.append(collapsed_mirnas)         # return the processed detected miRNAs
       data.append([case,file,collapsed_mirnas,sorted_uni_arms])
       ini_sample.append(filter_sam)    # returns the filtered sam file

    l.release()


######################################################################################################################################


"""

Read a sam file from Bowtie and do the followings:

1) Remove reverse stranded mapped reads
2) Remove unmapped reads 
3) Remove all sequences with reads less than 11 reads
4) Sort the arms with the most sequences in decreading rate
5) Sort the sequences of every arm with the most reads in decreasing rate
6) Calculate total number of sequences of every arm
7) Calculate total number of reads of sequences of every arm.
8) Store all the informations in a txt file 

"""

def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):

    # read the sam file
    ini_sam=read(path,0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq=[]
    unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]

    uni_seq=[]

    # Calculate the shifted positions for every non template mirna and add them to the name of it
    sorted_uni_arms = []
    for i in range(1,len(mature_mirnas),2):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences

        for j in range(len(unique_seq)):

            temp_mature = mature_mirnas[i].strip().replace("U", "T")

            # Detection of differences between the canonical miRNA and the detected non template miRNA
            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                if len(unique_diff)<=2:
                   unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                   pre_pos = mat_diff[0]-unique_diff[0]
                   post_pos = unique_diff[1]-mat_diff[1]

                   lengthofmir = len(off_part) + post_pos
                   if pre_pos == 0 and post_pos<4:
                      tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                      tmp_count_seq = tmp_count_seq + 1

                      t_name=unique_seq[j].copy()
                      t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                      uni_seq.append(t_name)
        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))

    # Output variables
    l.acquire()
    if case=="c" or case=="t":
       names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"])
       n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
       file_order.append(file)
       data.append([case,file,unique_seq,sorted_uni_arms])
    l.release()

#################################################################################################################################################################################################################

def black_white(mirna_names_1,mirna_names_2,group,manager):

    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
    add_names.sort()
    add_names = list(add_names for add_names,_ in itertools.groupby(add_names))

    group.sort()
    group = list(group for group,_ in itertools.groupby(group))

    zeros=["0"]*(len(group[0])-2)
    [add_names[i].extend(zeros) for i,_ in enumerate(add_names)]
    group=group+add_names

    manager.extend(group)

################################################################################################################################################################################################################################

def merging_dupes(group,f_dupes):

    dupes=[]
    final_mat =[]

    for num,_ in enumerate(group):

        if group[num][1] not in final_mat and group[num][0] not in final_mat:
           final_mat.append(group[num][1])
           final_mat.append(group[num][0])
        else:
           dupes.append(group[num][1])


    dupes=list(set(dupes))

    dupes=[[x] for x in dupes]

    for x in group:
        for y in dupes:
            if x[1]==y[0]:
               fl=0
               if len(y)==1:
                  y.append(x[0])
               else:
                  for i in range(1,len(y)):
                      if y[i].split("_")[0]==x[0].split("_")[0]:
                         fl=1
                         if len(x[0])<len(y[i]):
                            del y[i]
                            y.append(x[0])
                            break

                  if fl==0:
                     y.append((x[0]))

    for y in dupes:
        if len(y)>2:
           for i in range(len(y)-1,1,-1):
               y[1]=y[1]+"/"+y[i]
               del y[i]

    f_dupes.extend(dupes)

##########################################################################################################################################################################################################################################

def apply_merging_dupes(group,dupes,managger):

    for x in group:
     for y in dupes:
         if x[1]==y[0]:
            x[0]=y[1]

    group.sort()
    group=list(group for group,_ in itertools.groupby(group))
    managger.extend(group)

###############################################################################################################################################################################################################################


def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts):

    t_group_new=[]
    c_group_new=[]

    percent=int(per)/100
    c_col_filter=round(percent*(len(c_group[1])-2))
    t_col_filter=round(percent*(len(t_group[1])-2))

    for i, _ in enumerate(c_group):
        c_cols=0
        t_cols=0

        c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)])
        t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)])

        if c_cols>=c_col_filter or t_cols>=t_col_filter:
           t_group_new.append(t_group[i])
           c_group_new.append(c_group[i])

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)

##################################################################################################################################################################################################################


def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per):

 if flag == 1 and int(per)!=-1:
    fp = open('Counts/Filtered '+group_name2 +' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)

    for x in fil_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Filtered '+group_name1+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in fil_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 2 and int(per)!=-1:
    fp = open('Counts/Filtered '+group_name2+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)


    for x in fil_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Filtered '+group_name1+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in fil_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 1:
    fp = open('Counts/Raw '+group_name2+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)

    for x in raw_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Raw '+group_name1+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in raw_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 2:
    fp = open('Counts/Raw '+group_name2+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)


    for x in raw_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Raw '+group_name1+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in raw_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


#########################################################################################################################################

def temp_counts_to_diff(names,samp,folder):

    for i in range(2,len(samp[0])):

       fp = open(folder+names[i-2]+'.txt','w')
       fp.write("miRNA id"+"\t"+names[i-2]+"\n")

       for x in samp:
           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
       fp.close()

##################################################################################################################

def DB_write(con,name,unique_seq,sorted_uni_arms,f):

 if f==1:
    # Write a txt file with all the information
    if con=="c":
       fp = open('split1/'+name, 'w')

       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
    if con=="t":
       fp = open('split2/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))


    for i in range(len(sorted_uni_arms)):
        temp = []
        for j in range(len(unique_seq)):

            if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:

                temp.append(unique_seq[j])

        temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
        fp.write("*********************************************************************************************************\n")
        fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
        fp.write("*********************************************************************************************************\n\n")
        [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
        fp.write("\n" + "\n")
    fp.close()

 if f==2:

    if con=="c":
       fp = open('split3/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
    if con=="t":
       fp = open('split4/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))


    for i in range(len(sorted_uni_arms)):
        temp = []
        for j in range(len(unique_seq)):
               if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]:
                  temp.append(unique_seq[j])
        if temp!=[]:
           temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
           fp.write("*********************************************************************************************************\n")
           fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
           fp.write("*********************************************************************************************************\n\n")
           [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
           fp.write("\n" + "\n")
    fp.close()


##########################################################################################################################

def new_mat_seq(pre_unique_seq,mat_mirnas,l):

    unique_iso = []
    for x in pre_unique_seq:
       if len(x[2].split("_"))==3:
          for y in pre_unique_seq:
              if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
                 if any(y[2] in lst2 for lst2 in unique_iso)==False:
                    y[2]=">"+y[2]
                    unique_iso.append(y)
    l.acquire()
    for x in unique_iso:
        mat_mirnas.append(x[2])
        mat_mirnas.append(x[9])
    l.release()

#########################################################################################################################

def merging_names(ini_mat,new):

    dupes=[]
    final_mat =[]

    for num in range(len(ini_mat)):

        if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat:
           final_mat.append(ini_mat[num][1])
           final_mat.append(ini_mat[num][0])
        else:
           dupes.append(ini_mat[num][1])

    dupes=list(set(dupes))

    for i in range(len(dupes)):
        dupes[i]=[dupes[i]]

    for x in ini_mat:
        for y in dupes:
            if x[1]==y[0]:
               fl=0
               if len(y)==1:
                  y.append(x[0])
               else:
                  for i in range(1,len(y)):
                      if y[i].split("_")[0]==x[0].split("_")[0]:
                         fl=1
                         if len(x[0])<len(y[i]):
                            del y[i]
                            y.append(x[0])
                            break

                  if fl==0:
                     y.append((x[0]))

    for y in dupes:
        if len(y)>2:
           for i in range(len(y)-1,1,-1):
               y[1]=y[1]+"/"+y[i]
               del y[i]


    for x in ini_mat:
        for y in dupes:
            if x[1]==y[0]:
               x[0]=y[1]

    ini_mat.sort()
    ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat))

    new.extend(ini_mat)


######################################################################################################################################################

def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder):

    for i in range(2,len(tem_samp[0])):

       fp = open(folder+tem_names[i-2]+'.txt','w')
       fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")

       for x in tem_samp:
           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")

       for j in range(len(non_names)):
           if non_names[j]==tem_names[i-2]:
              for x in non_samp:
                  fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
       fp.close()

###################################################################################################################################################################################################################

"""

This function downloads all the miRNAs of all the species from MirBase
and filters them by the requested organism

input : Organism
output: A list with the miRNA sequences in fasta format

"""

def download_matures(matures,org_name):

    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    data = urllib.request.urlopen(url).read()
    file_mirna = gzip.decompress(data).decode('utf-8')
    file_mirna = file_mirna.split("\n")

    for i in range(0,len(file_mirna)-1,2):

        if org_name in file_mirna[i]:
           matures.append(file_mirna[i])
           matures.append(file_mirna[i+1])

###################################################################################################################################################################################################################


"""

This function keeps all mirna isoforms which are detected on SAM files from the first part of the analysis
These isoforms will be used as refence sequences with canonical (ref) mirnas for the detection of non-template
mirnas

"""


def non_template_ref(c_samples,t_samples,all_isoforms):

  pre_uni_seq_con = list(c_samples)
  pre_uni_seq_tre = list(t_samples)

  for x in pre_uni_seq_con:
      for y in x:
          #if ">"+y[2] not in all_isoforms and ")_" in y[2] :
           if ">"+y[2] not in all_isoforms and "_t_" in y[2] :
             all_isoforms.append(">"+y[2])
             all_isoforms.append(y[9])

  for x in pre_uni_seq_tre:
      for y in x:
          #if ">"+y[2] not in all_isoforms and ")_" in y[2]:
           if ">"+y[2] not in all_isoforms and "_t_" in y[2] :
             all_isoforms.append(">"+y[2])
             all_isoforms.append(y[9])

################################################################################################################################################################################################

"""

This function adds the uncommon detected miRNAs among samples.
As a result all samples will have the same length.

"""

def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order):

    for y in mir_names:
        flag=0
        for x in sample:
            if y[0]==x[0]: # check if miRNA exists in the sample
               flag=1
               break
        if flag==0:
           sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence

    # sorting and remove duplicates
    sample.sort(key=lambda x: x[0])
    sample=list(sample for sample,_ in itertools.groupby(sample))

    # Return the updated sample
    l.acquire()
    new_d.append(sample)
    sample_order.append(sample_name)
    l.release()

###############################################################################################################################################################################################