changeset 5:810e789ffeab draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:04:54 +0000
parents f44185f616bc
children d58f050acd18
files mirgene_functions.py
diffstat 1 files changed, 717 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mirgene_functions.py	Wed Oct 13 16:04:54 2021 +0000
@@ -0,0 +1,717 @@
+import itertools
+import urllib.request
+from collections import OrderedDict
+import copy
+
+########################################################################################################################################################
+
+""" Read a file and return it as a list """
+
+def read(path, flag):
+    if flag == 0:
+        with open(path) as fp:
+            file=fp.readlines()
+        fp.close()
+        return file
+
+    if flag == 1:
+        with open(path) as fp:
+            file = fp.read().splitlines()
+        fp.close()
+        return file
+
+# Write a list to a txt file
+def write(path, list):
+    with open(path,'w') as fp:
+        for x in list:
+            fp.write(str("\t".join(x[1:-1])))
+    fp.close()
+
+########################################################################################################################################################
+
+""" Detect the longest common substring sequence between two mirnas """
+
+def longestSubstring(str1, str2):
+
+    from difflib import SequenceMatcher
+    # initialize SequenceMatcher object with
+    # input string
+    seqMatch = SequenceMatcher(None, str1, str2)
+
+    # find match of longest sub-string
+    # output will be like Match(a=0, b=0, size=5)
+    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
+
+    # print longest substring
+    if (match.size != 0):
+        return str1[match.a: match.a + match.size]
+    else:
+        print('No longest common sub-string found')
+
+
+#################################################################################################################################################################
+
+"""
+
+Read the sam files from alignment tool and do the followings:
+
+1) Keep mapped reads
+2) Keep all sequences with length between 18 and 26 nucleotides
+3) Detects the ref and templated miRNAs
+4) Gives names to templated miRNAs based on ref miRNAs
+
+"""
+
+def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):
+
+    # read the sam file
+    ini_sam=read(path,0)
+    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
+    unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
+    filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam]
+    sorted_uni_arms = []
+
+    # Detection of differences between the canonical miRNA and the detected miRNA
+    for i in range(len(mature_mirnas)):
+        tmp_count_reads = 0   # calculate the total number of reads
+        tmp_count_seq = 0     # calculate the total number of sequences
+        for j in range(len(unique_seq)):
+
+         if mature_mirnas[i] == unique_seq[j][2]:
+
+                temp_mature = mature_mirnas[i+1]
+                off_part = longestSubstring(temp_mature, unique_seq[j][9])
+
+                mat_diff = temp_mature.split(off_part)
+                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
+
+                unique_diff = unique_seq[j][9].split(off_part)
+                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
+
+                # Handling of some special mirnas like (hsa-miR-8485)
+                if mat_diff[1]!=0 and unique_diff[1]!=0:
+                    unique_seq[j]=1
+                    pre_pos = 0
+                    post_pos = 0
+
+                elif mat_diff[0]!=0 and unique_diff[0]!=0:
+                    unique_seq[j]=1
+                    pre_pos = 0
+                    post_pos = 0
+
+                else:
+                   # Keep the findings
+                   pre_pos = mat_diff[0]-unique_diff[0]
+                   post_pos = unique_diff[1]-mat_diff[1]
+                   tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
+                   tmp_count_seq = tmp_count_seq+1
+
+                # Store the detected miRNAs with new names according to the findings
+                if pre_pos != 0 or post_pos != 0:
+                    if pre_pos == 0:
+                        unique_seq[j][2] = unique_seq[j][2] + "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
+                    elif post_pos == 0:
+                        unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
+                    else:
+                        unique_seq[j][2] = unique_seq[j][2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
+
+        # Remove the values "1" from the handling of special mirnas (hsa-miR-8485)
+        for x in range(unique_seq.count(1)):
+           unique_seq.remove(1)
+
+        # metrics for the production of database
+        if tmp_count_reads != 0 and tmp_count_seq != 0:
+           sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])
+
+    # Sorting of the metrics for database
+    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
+
+    # Correction of metrics due to the collapsing and removing of duplicates for the production of Database
+    for y in sorted_uni_arms:
+       counts=0
+       seqs=0
+       for x in unique_seq:
+           if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]:
+              counts+=int(x[0].split("-")[1])
+              seqs+=1
+
+       y[1]=seqs
+       y[2]=counts
+
+    # Output variables
+    temp_mirna_names=[]
+
+    l.acquire()
+    if case == "c" or case == "t":
+       temp_mirna_names.extend(z[2] for z in unique_seq)
+       names_n_seqs.extend([[y[2],y[9]] for y in unique_seq])
+       deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
+       mirna_names.extend(temp_mirna_names)
+       unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])
+       unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])
+       file_order.append(file)
+       samples.append(unique_seq)
+       data.append([case,file,unique_seq,sorted_uni_arms])
+       ini_sample.append(filter_sam)
+    l.release()
+
+
+######################################################################################################################################
+"""
+
+Read a sam file from Bowtie and do the followings:
+
+1) Keep unmapped reads
+2) Keep all sequences with length between 18 and 26 nucleotides
+3) Detects the non-template isomirs
+4) Gives names to isomir's based on ref miRNAs
+
+"""
+
+def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):
+
+    # read the sam file
+    ini_sam=read(path,0)
+    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
+    unique_seq=[]
+    unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
+    uni_seq=[]
+
+    # Calculate the shifted positions for every isomir and add them to the name of it
+    sorted_uni_arms = []
+    for i in range(1,len(mature_mirnas),2):
+        tmp_count_reads = 0   # calculate the total number of reads
+        tmp_count_seq = 0     # calculate the total number of sequences
+
+        for j in range(len(unique_seq)):
+
+            temp_mature = mature_mirnas[i].strip().replace("U", "T")
+
+            # Detection of differences between the canonical miRNA and the detected non template miRNA
+            if temp_mature in unique_seq[j][9]:
+
+                off_part = longestSubstring(temp_mature, unique_seq[j][9])
+
+                mat_diff = temp_mature.split(off_part)
+                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
+
+                unique_diff = unique_seq[j][9].split(off_part)
+                if len(unique_diff)<=2:
+                   unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
+
+                   pre_pos = mat_diff[0]-unique_diff[0]
+                   post_pos = unique_diff[1]-mat_diff[1]
+
+                   lengthofmir = len(off_part) + post_pos
+                   if pre_pos == 0 and post_pos<4:
+                      tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
+                      tmp_count_seq = tmp_count_seq + 1
+
+                      t_name=copy.deepcopy(unique_seq[j])
+                      t_name[2]=mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
+                      uni_seq.append(t_name)
+
+        # metrics for the production of database
+        if tmp_count_reads != 0 and tmp_count_seq != 0:
+            sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads])
+
+
+    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
+    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
+
+    # Output variables
+
+    l.acquire()
+    if case == "c" or case == "t":
+       names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"])
+       n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
+       file_order.append(file)
+       data.append([case,file,unique_seq,sorted_uni_arms])
+    l.release()
+
+#################################################################################################################################################################################################################
+
+"""
+
+This function detects the differences between the two groups (control, treated).
+Then copy the undetected miRNAs from one group to other and add zeros as counts.
+With this way the two groups will have the same number of miRNAs.
+
+"""
+
+def black_white(mirna_names_1,mirna_names_2,group,manager):
+
+    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
+    add_names.sort()
+    add_names = list(add_names for add_names,_ in itertools.groupby(add_names))
+
+    group.sort()
+    group = list(group for group,_ in itertools.groupby(group))
+
+    zeros=["0"]*(len(group[0])-2)
+    [add_names[i].extend(zeros) for i,_ in enumerate(add_names)]
+    group=group+add_names
+
+    manager.extend(group)
+
+########################################################################################################>
+
+"""
+
+This function collapses the miRNAs with same sequences and different names into one entry
+by merging all the different names into one and are separated with the character "/"
+
+"""
+
+def merging_dupes(group,f_dupes):
+
+    dupes=[]
+    temp_mat =[]
+
+    for num,_ in enumerate(group):
+
+        if group[num][1] not in temp_mat and group[num][0] not in temp_mat:
+           temp_mat.append(group[num][1])
+           temp_mat.append(group[num][0])
+        else:
+           dupes.append(group[num][1])
+
+
+    dupes=list(set(dupes))
+
+    dupes=[[x] for x in dupes]
+
+    for x in group:
+        for y in dupes:
+            if x[1]==y[0]:
+               fl=0
+               if len(y)==1:
+                  y.append(x[0])
+               else:
+                  for i in range(1,len(y)):
+                      if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
+                         fl=1
+                         if len(x[0])<len(y[i]):
+                            del y[i]
+                            y.append(x[0])
+                            break
+
+                  if fl==0:
+                     y.append((x[0]))
+
+    for y in dupes:
+        if len(y)>2:
+           for i in range(len(y)-1,1,-1):
+               y[1]=y[1]+"/"+y[i]
+               del y[i]
+
+    f_dupes.extend(dupes)
+
+
+########################################################################################################>
+
+"""
+
+This function removes the duplications of sequences based on output from the fuction merging_dupes
+
+"""
+
+def apply_merging_dupes(group,dupes,managger):
+
+    for x in group:
+     for y in dupes:
+         if x[1]==y[0]:
+            x[0]=y[1]
+
+    group.sort()
+    group=list(group for group,_ in itertools.groupby(group))
+    managger.extend(group)
+
+########################################################################################################>
+
+"""
+
+This function is optional and performs a filter for low counts miRNAs based on
+number of counts and the percentage of the samples, according to user preferences
+
+"""
+
+def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts):
+
+    t_group_new=[]
+    c_group_new=[]
+
+    percent=int(per)/100
+    c_col_filter=round(percent*(len(c_group[1])-2))
+    t_col_filter=round(percent*(len(t_group[1])-2))
+
+    for i, _ in enumerate(c_group):
+        c_cols=0
+        t_cols=0
+
+        c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)])
+        t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)])
+
+        if c_cols>=c_col_filter or t_cols>=t_col_filter:
+           t_group_new.append(t_group[i])
+           c_group_new.append(c_group[i])
+
+    fil_c_group.extend(c_group_new)
+    fil_t_group.extend(t_group_new)
+
+##################################################################################################################################################################################################################
+
+"""
+
+This function exports the count matrices for every group (controls, treated)
+and condition (ref and templated miRNAs, non-templated miRNAs)
+
+"""
+
+def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per):
+
+ if flag == 1 and int(per)!=-1:
+    fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in tre_file_order:
+       fp.write("\t"+y)
+
+    for x in fil_tre:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+    fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in con_file_order:
+       fp.write("\t"+y)
+
+    for x in fil_con:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+
+ if flag == 2 and int(per)!=-1:
+    fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in tre_file_order:
+       fp.write("\t"+y)
+
+
+    for x in fil_tre:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+    fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in con_file_order:
+       fp.write("\t"+y)
+
+    for x in fil_con:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+
+ if flag == 1:
+    fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in tre_file_order:
+       fp.write("\t"+y)
+
+    for x in raw_tre:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+    fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in con_file_order:
+       fp.write("\t"+y)
+
+    for x in raw_con:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+ if flag == 2:
+    fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in tre_file_order:
+       fp.write("\t"+y)
+
+
+    for x in raw_tre:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+    fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
+    fp.write("Name\t")
+    fp.write("Sequence")
+    for y in con_file_order:
+       fp.write("\t"+y)
+
+    for x in raw_con:
+        fp.write("\n%s" % "\t".join(x))
+    fp.close()
+
+####################################################################################################################################################################################################################
+
+"""
+
+This function exports the files of the database with all the info
+about every type of the detected miRNAs for every sample
+
+"""
+
+def DB_write(con,name,unique_seq,sorted_uni_arms,f):
+
+ if f==1:
+    if con=="c":
+       fp = open('split1/'+name, 'w')
+
+       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
+    if con=="t":
+       fp = open('split2/'+name, 'w')
+       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
+
+    for i in range(len(sorted_uni_arms)):
+        temp = []
+        for j in range(len(unique_seq)):
+
+            if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]):
+
+                temp.append(unique_seq[j])
+
+        temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
+        fp.write("*********************************************************************************************************\n")
+        fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
+        fp.write("*********************************************************************************************************\n\n")
+        [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
+        fp.write("\n" + "\n")
+    fp.close()
+
+ if f==2:
+
+    if con=="c":
+       fp = open('split3/'+name, 'w')
+       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
+    if con=="t":
+       fp = open('split4/'+name, 'w')
+       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
+
+    for i in range(len(sorted_uni_arms)):
+        temp = []
+        for j in range(len(unique_seq)):
+               if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]:
+                  temp.append(unique_seq[j])
+        if temp!=[]:
+           temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
+           fp.write("*********************************************************************************************************\n")
+           fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
+           fp.write("*********************************************************************************************************\n\n")
+           [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
+           fp.write("\n" + "\n")
+    fp.close()
+
+
+#########################################################################################################################
+
+"""
+
+This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates
+
+"""
+
+def merging_names(ini_mat,new):
+
+    dupes=[]
+    temp_mat =[]
+
+    for num in range(len(ini_mat)):
+
+        if ini_mat[num][1] not in temp_mat and ini_mat[num][0] not in temp_mat:
+           temp_mat.append(ini_mat[num][1])
+           temp_mat.append(ini_mat[num][0])
+        else:
+           dupes.append(ini_mat[num][1])
+
+    dupes=list(set(dupes))
+
+    for i in range(len(dupes)):
+        dupes[i]=[dupes[i]]
+
+    for x in ini_mat:
+        for y in dupes:
+            if x[1]==y[0]:
+               fl=0
+               if len(y)==1:
+                  y.append(x[0])
+               else:
+                  for i in range(1,len(y)):
+                      if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
+                         fl=1
+                         if len(x[0])<len(y[i]):
+                            del y[i]
+                            y.append(x[0])
+                            break
+
+                  if fl==0:
+                     y.append((x[0]))
+
+    for y in dupes:
+        if len(y)>2:
+           for i in range(len(y)-1,1,-1):
+               y[1]=y[1]+"/"+y[i]
+               del y[i]
+
+
+    for x in ini_mat:
+        for y in dupes:
+            if x[1]==y[0]:
+               x[0]=y[1]
+
+    ini_mat.sort()
+    ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat))
+    new.extend(ini_mat)
+
+####################################################################################################################################################################################################################
+
+"""
+
+This function exports the count matrices for differential expresion
+if user chose analysis with non-templated miRNAs detection
+
+"""
+
+def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro):
+
+    for i in range(2,len(tem_samp[0])):
+
+       fp = open(folder+tem_names[i-2]+'.txt','w')
+       fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
+
+       for x in tem_samp:
+           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
+
+       for j in range(len(non_names)):
+           if non_names[j]==tem_names[i-2]:
+              for x in non_samp:
+                  fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
+       fp.close()
+
+#################################################################################################################################################################################################################
+
+"""
+
+This function exports the count matrices for differential expresion
+if user chose analysis only with templated miRNAs detection
+
+"""
+
+def temp_counts_to_diff(names,samp,folder,pro):
+
+    for i in range(2,len(samp[0])):
+
+       fp = open(folder+names[i-2]+'.txt','w')
+       fp.write("miRNA id"+"\t"+names[i-2]+"\n")
+
+       for x in samp:
+           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
+       fp.close()
+
+#################################################################################################################################################################################################################
+
+"""
+
+This function downloads the fasta files from MirGene site
+with ref miRNAs and star miRNAs sequences and merges them
+into one list
+
+"""
+
+def download_matures(matures,org_name):
+
+    mature_mir=[]
+
+    mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1'
+    star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1'
+
+    data = urllib.request.urlopen(mat_url).read()
+    file_mirna = data.decode('utf-8')
+    mature_mir = file_mirna.split("\n")
+    mature_mir = [x.replace(">","") for x in mature_mir]
+    del mature_mir[-1]
+
+    data = urllib.request.urlopen(star_url).read()
+    file_mirna = data.decode('utf-8')
+    star_mir = file_mirna.split("\n")
+    star_mir = [x.replace(">","") for x in star_mir]
+    del star_mir[-1]
+
+    mature_mir.extend(star_mir)
+
+    for i in range(1,len(mature_mir),2):
+        mature_mir[i]=mature_mir[i].replace("U","T")
+
+    matures.extend(mature_mir)
+
+###################################################################################################################
+
+"""
+
+This function detects the templated isoforms from the 1st part of analysis
+These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs
+
+"""
+
+def non_template_ref(sc,st,all_isoforms):
+
+  pre_uni_seq_con = list(sc)
+  pre_uni_seq_tre = list(st)
+
+  for x in pre_uni_seq_con:
+      for y in x:
+          if y[2] not in all_isoforms and "_t_" in y[2]:
+             all_isoforms.append(y[2])
+             all_isoforms.append(y[9])
+
+  for x in pre_uni_seq_tre:
+      for y in x:
+          if y[2] not in all_isoforms and "_t_" in y[2]:
+             all_isoforms.append(y[2])
+             all_isoforms.append(y[9])
+
+################################################################################################################################################################################################
+
+"""
+
+This function adds uncommon detected mirnas among the samples with zeros as counts
+
+"""
+
+def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order):
+
+    for y in mir_names:
+        flag=0
+        for x in sample:
+            if y[0]==x[0]: # check if miRNA exists in the sample
+               flag=1
+               break
+        if flag==0:
+           sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence
+
+    # sorting and remove duplicates
+    sample.sort(key=lambda x: x[0])
+    sample=list(sample for sample,_ in itertools.groupby(sample))
+
+    # Return the updated sample
+    l.acquire()
+    new_d.append(sample)
+    sample_order.append(sample_name)
+    l.release()
+
+###############################################################################################################################################################################################