view mirgene_functions.py @ 9:355286aed1a6 draft

author glogobyte
date Wed, 13 Oct 2021 16:08:48 +0000

import itertools
import urllib.request
from collections import OrderedDict
import copy

########################################################################################################################################################

""" Read a file and return it as a list """

def read(path, flag):
    if flag == 0:
        with open(path) as fp:
            lines = fp.readlines()
        return lines

    if flag == 1:
        with open(path) as fp:
            lines = fp.read().splitlines()
        return lines

# Write a list of rows to a tab-separated txt file
def write(path, rows):
    with open(path, 'w') as fp:
        for x in rows:
            fp.write("\t".join(x[1:-1]))
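
# Illustrative usage sketch (hypothetical file names, not part of the pipeline):
#   sam_lines = read("sample.sam", 0)   # flag 0: keep trailing newlines
#   sam_lines = read("sample.sam", 1)   # flag 1: strip trailing newlines
#   write("counts.txt", rows)           # writes "\t".join(row[1:-1]) for every row in rows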

########################################################################################################################################################

""" Detect the longest common substring sequence between two mirnas """

def longestSubstring(str1, str2):

    from difflib import SequenceMatcher
    # initialize SequenceMatcher with the two input strings
    seqMatch = SequenceMatcher(None, str1, str2)

    # find the longest matching block
    # output will be like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    # return the longest common substring
    if (match.size != 0):
        return str1[match.a: match.a + match.size]
    else:
        print('No longest common sub-string found')
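
# Worked example (illustrative sequences): the second string is fully contained in the
# first, so the call returns it as the longest common substring.
#   longestSubstring("TAGCTTATCAGACTGATGTTGA", "AGCTTATCAGACTGATG")
#   -> "AGCTTATCAGACTGATG"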


#################################################################################################################################################################

"""

Read the sam files from alignment tool and do the followings:

1) Keep mapped reads
2) Keep all sequences with length between 18 and 26 nucleotides
3) Detects the ref and templated miRNAs
4) Gives names to templated miRNAs based on ref miRNAs

"""

def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):

    # read the sam file
    ini_sam=read(path,0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
    filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam]
    sorted_uni_arms = []

    # Detection of differences between the canonical miRNA and the detected miRNA
    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences
        for j in range(len(unique_seq)):

            if mature_mirnas[i] == unique_seq[j][2]:

                temp_mature = mature_mirnas[i+1]
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Handling of some special mirnas like (hsa-miR-8485)
                if mat_diff[1]!=0 and unique_diff[1]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0]!=0 and unique_diff[0]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                else:
                   # Keep the findings
                   pre_pos = mat_diff[0]-unique_diff[0]
                   post_pos = unique_diff[1]-mat_diff[1]
                   tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                   tmp_count_seq = tmp_count_seq+1

                # Store the detected miRNAs with new names according to the findings
                if pre_pos != 0 or post_pos != 0:
                    if pre_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                    elif post_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                    else:
                        unique_seq[j][2] = unique_seq[j][2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)

        # Remove the values "1" from the handling of special mirnas (hsa-miR-8485)
        for x in range(unique_seq.count(1)):
           unique_seq.remove(1)

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
           sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])

    # Sorting of the metrics for database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Correction of metrics due to the collapsing and removing of duplicates for the production of Database
    for y in sorted_uni_arms:
       counts=0
       seqs=0
       for x in unique_seq:
           if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]:
              counts+=int(x[0].split("-")[1])
              seqs+=1

       y[1]=seqs
       y[2]=counts

    # Output variables
    temp_mirna_names=[]

    l.acquire()
    if case == "c" or case == "t":
       temp_mirna_names.extend(z[2] for z in unique_seq)
       names_n_seqs.extend([[y[2],y[9]] for y in unique_seq])
       deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
       mirna_names.extend(temp_mirna_names)
       unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])
       unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])
       file_order.append(file)
       samples.append(unique_seq)
       data.append([case,file,unique_seq,sorted_uni_arms])
       ini_sample.append(filter_sam)
    l.release()


######################################################################################################################################
"""

Read a sam file from Bowtie and do the followings:

1) Keep unmapped reads
2) Keep all sequences with length between 18 and 26 nucleotides
3) Detects the non-template isomirs
4) Gives names to isomir's based on ref miRNAs

"""

def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):

    # read the sam file
    ini_sam=read(path,0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
    uni_seq=[]

    # Calculate the shifted positions for every isomir and add them to the name of it
    sorted_uni_arms = []
    for i in range(1,len(mature_mirnas),2):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences

        for j in range(len(unique_seq)):

            temp_mature = mature_mirnas[i].strip().replace("U", "T")

            # Detection of differences between the canonical miRNA and the detected non template miRNA
            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                if len(unique_diff)<=2:
                   unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                   pre_pos = mat_diff[0]-unique_diff[0]
                   post_pos = unique_diff[1]-mat_diff[1]

                   lengthofmir = len(off_part) + post_pos
                   if pre_pos == 0 and post_pos<4:
                      tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                      tmp_count_seq = tmp_count_seq + 1

                      t_name=copy.deepcopy(unique_seq[j])
                      t_name[2]=mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                      uni_seq.append(t_name)

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads])


    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))

    # Output variables

    l.acquire()
    if case == "c" or case == "t":
       names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"])
       n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
       file_order.append(file)
       data.append([case,file,unique_seq,sorted_uni_arms])
    l.release()

#################################################################################################################################################################################################################

"""

This function detects the differences between the two groups (control, treated).
Then copy the undetected miRNAs from one group to other and add zeros as counts.
With this way the two groups will have the same number of miRNAs.

"""

def black_white(mirna_names_1,mirna_names_2,group,manager):

    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
    add_names.sort()
    add_names = list(add_names for add_names,_ in itertools.groupby(add_names))

    group.sort()
    group = list(group for group,_ in itertools.groupby(group))

    zeros=["0"]*(len(group[0])-2)
    [add_names[i].extend(zeros) for i,_ in enumerate(add_names)]
    group=group+add_names

    manager.extend(group)
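
# Illustrative sketch (hypothetical rows): if group contains
#   ["Hsa-Mir-21_5p", "TAGCTTATCAGACTGA", "10", "7"]
# and ["Hsa-Mir-22_3p", "AAGCTGCCAGTTGAAG"] appears only in the other group's name list,
# black_white appends ["Hsa-Mir-22_3p", "AAGCTGCCAGTTGAAG", "0", "0"] to group,
# so both groups end up with the same set of miRNA rows.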

########################################################################################################>

"""

This function collapses the miRNAs with same sequences and different names into one entry
by merging all the different names into one and are separated with the character "/"

"""

def merging_dupes(group,f_dupes):

    dupes=[]
    temp_mat =[]

    for num,_ in enumerate(group):

        if group[num][1] not in temp_mat and group[num][0] not in temp_mat:
           temp_mat.append(group[num][1])
           temp_mat.append(group[num][0])
        else:
           dupes.append(group[num][1])


    dupes=list(set(dupes))

    dupes=[[x] for x in dupes]

    for x in group:
        for y in dupes:
            if x[1]==y[0]:
               fl=0
               if len(y)==1:
                  y.append(x[0])
               else:
                  for i in range(1,len(y)):
                      if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
                         fl=1
                         if len(x[0])<len(y[i]):
                            del y[i]
                            y.append(x[0])
                            break

                  if fl==0:
                     y.append((x[0]))

    for y in dupes:
        if len(y)>2:
           for i in range(len(y)-1,1,-1):
               y[1]=y[1]+"/"+y[i]
               del y[i]

    f_dupes.extend(dupes)
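
# Illustrative sketch (hypothetical names, "SEQ" stands for a shared sequence): two rows such as
#   ["Hsa-Mir-121_5p_t_0_+1", "SEQ", ...] and ["Hsa-Mir-7_5p_t_0_+1", "SEQ", ...]
# yield a dupes entry like ["SEQ", "Hsa-Mir-121_5p_t_0_+1/Hsa-Mir-7_5p_t_0_+1"],
# which apply_merging_dupes then uses to rewrite the name of every row carrying that sequence.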


########################################################################################################>

"""

This function removes the duplications of sequences based on output from the fuction merging_dupes

"""

def apply_merging_dupes(group,dupes,managger):

    for x in group:
     for y in dupes:
         if x[1]==y[0]:
            x[0]=y[1]

    group.sort()
    group=list(group for group,_ in itertools.groupby(group))
    managger.extend(group)

########################################################################################################>

"""

This function is optional and performs a filter for low counts miRNAs based on
number of counts and the percentage of the samples, according to user preferences

"""

def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts):

    t_group_new=[]
    c_group_new=[]

    percent=int(per)/100
    c_col_filter=round(percent*(len(c_group[1])-2))
    t_col_filter=round(percent*(len(t_group[1])-2))

    for i, _ in enumerate(c_group):
        c_cols=0
        t_cols=0

        c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)])
        t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)])

        if c_cols>=c_col_filter or t_cols>=t_col_filter:
           t_group_new.append(t_group[i])
           c_group_new.append(c_group[i])

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)
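
# Worked example (hypothetical parameters): with per=50, counts=10 and 4 samples per group,
# c_col_filter = t_col_filter = round(0.5 * 4) = 2, so a miRNA is kept only if at least
# 2 control samples or at least 2 treated samples have >= 10 counts for it.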

##################################################################################################################################################################################################################

"""

This function exports the count matrices for every group (controls, treated)
and condition (ref and templated miRNAs, non-templated miRNAs)

"""

def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per):

 if flag == 1 and int(per)!=-1:
    fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)

    for x in fil_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in fil_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 2 and int(per)!=-1:
    fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)


    for x in fil_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in fil_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 1:
    fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)

    for x in raw_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in raw_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

 if flag == 2:
    fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in tre_file_order:
       fp.write("\t"+y)


    for x in raw_tre:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in con_file_order:
       fp.write("\t"+y)

    for x in raw_con:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

####################################################################################################################################################################################################################

"""

This function exports the files of the database with all the info
about every type of the detected miRNAs for every sample

"""

def DB_write(con,name,unique_seq,sorted_uni_arms,f):

 if f==1:
    if con=="c":
       fp = open('split1/'+name, 'w')

       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
    if con=="t":
       fp = open('split2/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

    for i in range(len(sorted_uni_arms)):
        temp = []
        for j in range(len(unique_seq)):

            if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]):

                temp.append(unique_seq[j])

        temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
        fp.write("*********************************************************************************************************\n")
        fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
        fp.write("*********************************************************************************************************\n\n")
        [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
        fp.write("\n" + "\n")
    fp.close()

 if f==2:

    if con=="c":
       fp = open('split3/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
    if con=="t":
       fp = open('split4/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

    for i in range(len(sorted_uni_arms)):
        temp = []
        for j in range(len(unique_seq)):
               if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]:
                  temp.append(unique_seq[j])
        if temp!=[]:
           temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
           fp.write("*********************************************************************************************************\n")
           fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
           fp.write("*********************************************************************************************************\n\n")
           [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
           fp.write("\n" + "\n")
    fp.close()


#########################################################################################################################

"""

This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates

"""

def merging_names(ini_mat,new):

    dupes=[]
    temp_mat =[]

    for num in range(len(ini_mat)):

        if ini_mat[num][1] not in temp_mat and ini_mat[num][0] not in temp_mat:
           temp_mat.append(ini_mat[num][1])
           temp_mat.append(ini_mat[num][0])
        else:
           dupes.append(ini_mat[num][1])

    dupes=list(set(dupes))

    for i in range(len(dupes)):
        dupes[i]=[dupes[i]]

    for x in ini_mat:
        for y in dupes:
            if x[1]==y[0]:
               fl=0
               if len(y)==1:
                  y.append(x[0])
               else:
                  for i in range(1,len(y)):
                      if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
                         fl=1
                         if len(x[0])<len(y[i]):
                            del y[i]
                            y.append(x[0])
                            break

                  if fl==0:
                     y.append((x[0]))

    for y in dupes:
        if len(y)>2:
           for i in range(len(y)-1,1,-1):
               y[1]=y[1]+"/"+y[i]
               del y[i]


    for x in ini_mat:
        for y in dupes:
            if x[1]==y[0]:
               x[0]=y[1]

    ini_mat.sort()
    ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat))
    new.extend(ini_mat)
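
# Note: this mirrors merging_dupes followed by apply_merging_dupes, but performs both steps
# in place for a single group's matrix.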

####################################################################################################################################################################################################################

"""

This function exports the count matrices for differential expresion
if user chose analysis with non-templated miRNAs detection

"""

def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro):

    for i in range(2,len(tem_samp[0])):

       fp = open(folder+tem_names[i-2]+'.txt','w')
       fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")

       for x in tem_samp:
           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")

       for j in range(len(non_names)):
           if non_names[j]==tem_names[i-2]:
              for x in non_samp:
                  fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
       fp.close()
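
# Output sketch (hypothetical sample name "Control1"): the file <folder>Control1.txt starts with
#   miRNA id<TAB>Control1
# followed by one "<miRNA name><TAB><count>" row per templated miRNA and then the matching
# non-templated rows for the same sample.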

#################################################################################################################################################################################################################

"""

This function exports the count matrices for differential expresion
if user chose analysis only with templated miRNAs detection

"""

def temp_counts_to_diff(names,samp,folder,pro):

    for i in range(2,len(samp[0])):

       fp = open(folder+names[i-2]+'.txt','w')
       fp.write("miRNA id"+"\t"+names[i-2]+"\n")

       for x in samp:
           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
       fp.close()

#################################################################################################################################################################################################################

"""

This function downloads the fasta files from MirGene site
with ref miRNAs and star miRNAs sequences and merges them
into one list

"""

def download_matures(matures,org_name):

    mature_mir=[]

    mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1'
    star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1'

    data = urllib.request.urlopen(mat_url).read()
    file_mirna = data.decode('utf-8')
    mature_mir = file_mirna.split("\n")
    mature_mir = [x.replace(">","") for x in mature_mir]
    del mature_mir[-1]

    data = urllib.request.urlopen(star_url).read()
    file_mirna = data.decode('utf-8')
    star_mir = file_mirna.split("\n")
    star_mir = [x.replace(">","") for x in star_mir]
    del star_mir[-1]

    mature_mir.extend(star_mir)

    for i in range(1,len(mature_mir),2):
        mature_mir[i]=mature_mir[i].replace("U","T")

    matures.extend(mature_mir)
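
# Result layout sketch (from the code): the resulting matures list alternates name and sequence, e.g.
#   ["Hsa-Mir-21_5p", "TAGCTTATCAGACTGATGTTGA", ...]   (illustrative entry)
# with every "U" in the sequences replaced by "T"; org_name is a MirGeneDB organism code
# such as "hsa" (hypothetical example).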

###################################################################################################################

"""

This function detects the templated isoforms from the 1st part of analysis
These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs

"""

def non_template_ref(sc,st,all_isoforms):

  pre_uni_seq_con = list(sc)
  pre_uni_seq_tre = list(st)

  for x in pre_uni_seq_con:
      for y in x:
          if y[2] not in all_isoforms and "_t_" in y[2]:
             all_isoforms.append(y[2])
             all_isoforms.append(y[9])

  for x in pre_uni_seq_tre:
      for y in x:
          if y[2] not in all_isoforms and "_t_" in y[2]:
             all_isoforms.append(y[2])
             all_isoforms.append(y[9])
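
# Note: the resulting all_isoforms list alternates name and sequence, matching the layout of
# the mature miRNA list that non_sam_edit expects as its first argument.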

################################################################################################################################################################################################

"""

This function adds uncommon detected mirnas among the samples with zeros as counts

"""

def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order):

    for y in mir_names:
        flag=0
        for x in sample:
            if y[0]==x[0]: # check if miRNA exists in the sample
               flag=1
               break
        if flag==0:
           sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence

    # sorting and remove duplicates
    sample.sort(key=lambda x: x[0])
    sample=list(sample for sample,_ in itertools.groupby(sample))

    # Return the updated sample
    l.acquire()
    new_d.append(sample)
    sample_order.append(sample_name)
    l.release()
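
# Illustrative sketch (hypothetical entries): if mir_names contains ["Hsa-Mir-22_3p", "AAGCTGCC"]
# and the sample has no row with that name, the row ["Hsa-Mir-22_3p", "0", "AAGCTGCC"] is
# appended, so every sample reports the same set of miRNAs.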

###############################################################################################################################################################################################