Mercurial > repos > glogobyte > isoread
diff mirbase_functions.py @ 2:47232a73a46b draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 13 Oct 2021 16:04:11 +0000 |
parents | |
children | 77ba8dde6fb7 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mirbase_functions.py Wed Oct 13 16:04:11 2021 +0000 @@ -0,0 +1,766 @@ +import itertools +import re +import urllib.request +import gzip +import copy +from collections import OrderedDict + + + +# Read a file and return it as a list +def read(path, flag): + if flag == 0: + with open(path) as fp: + file=fp.readlines() + fp.close() + return file + + if flag == 1: + with open(path) as fp: + file = fp.read().splitlines() + fp.close() + return file + +# Write a list to a txt file +def write(path, list): + with open(path,'w') as fp: + for x in list: + fp.write(str("\t".join(x[1:-1]))) + fp.close() + + +#################################################################################################################> + +# Detect the longest common substring sequence between two mirnas +def longestSubstring(str1, str2): + + from difflib import SequenceMatcher + # initialize SequenceMatcher object with + # input string + seqMatch = SequenceMatcher(None, str1, str2) + + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + + # print longest substring + if (match.size != 0): + return str1[match.a: match.a + match.size] + else: + print('No longest common sub-string found') + +################################################################################################################################################################################################################# + +""" + +This function concatenates miRNAs which are generated from different chromosomes +and eliminates the duplications of miRNAs on every sample + +input: detected miRNAs +output: collpased miRNAs without duplicates + +""" + + +def remove_duplicates(mirnas): + + # Detection of canonical mirRNAs whicha are generated from different chromosomes + dupes=[[x[9],x[0],x[2]] for x in mirnas] + + for x in mirnas: + for y in dupes: + if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]: + y.append(x[2]) + + # Detection of different chromosomes for every miRNA + chr_order = [] + for x in dupes: + temp = [] + for i in range(2,len(x)): + if x[i].split("chr")[1].split("(")[0].isdigit(): + temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) + else: + temp.append(x[i].split("chr")[1][0:4]) + + for z in temp: + if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: + temp = [str(j) for j in temp] + temp = list(set(temp)) + temp.sort() + chr_order.append(temp) + + # Collapsing the miRNAs with the same sequence from different chromosomes + collapsed_dupes=[] + for i in range(len(dupes)): + collapsed_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) + for x in chr_order[i]: + chr_check = re.match("[-+]?\d+$", str(x)) # check if chromosome is 'X' or 'Y' + if chr_check is not None: + if int(x)<0: # Check the strand (+) or (-) + collapsed_dupes[i][1]= collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" + else: + collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" + else: + collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(x) + + # Remove duplicates from collapsed_dupes + collapsed_dupes.sort() + collapsed_dupes = list(collapsed_dupes for collapsed_dupes,_ in itertools.groupby(collapsed_dupes)) + + for i in range(len(mirnas)): + for x in collapsed_dupes: + + # Naming of template isomirs (adding positions in the names) + if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) >3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: + gg=str("_t_"+mirnas[i][2].split("_")[-2]+"_"+mirnas[i][2].split("_")[-1]) + mirnas[i][2] = x[1]+gg + break + + # Naming of canonical miRNAs (collpsed names) + if mirnas[i][9]==x[0] and mirnas[i][0]== x[2] and len(mirnas[i][2].split("_"))==3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: + mirnas[i][2] = x[1] + break + + # Remove duplicates + mirnas.sort() + mirnas=list(mirnas for mirnas,_ in itertools.groupby(mirnas)) + + return mirnas + +############################################################################################################################################################################################################# + +""" + +This function indentifies and classifies the miRNAs which are detected from the alignment tool. + +""" + +def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts): + + # read the sam file + ini_sam=read(path,0) + main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] # remove introduction + unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] # keeps only the functional miRNAs + filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam] # keeps only the necessary info of miRNAs from sam files (name, sequence, counts, etc) + + sorted_uni_arms = [] + + for i in range(0,len(mature_mirnas,),2): + tmp_count_reads = 0 # calculate the total number of reads + tmp_count_seq = 0 # calculate the total number of sequences + for j in range(len(unique_seq)): + + if "{" in unique_seq[j][2].split("_")[0]: # checks if a miRNA is generated from two different locis on the same chromosome + mirna=unique_seq[j][2].split("_")[0][:-4] + else: + mirna=unique_seq[j][2].split("_")[0] + + # Detection of differences between the canonical miRNA and the detected miRNA + if mature_mirnas[i].split(" ")[0][1:] == mirna: + + temp_mature = mature_mirnas[i+1].strip().replace("U", "T") + off_part = longestSubstring(temp_mature, unique_seq[j][9]) + + mat_diff = temp_mature.split(off_part) + mat_diff = [len(mat_diff[0]), len(mat_diff[1])] + + unique_diff = unique_seq[j][9].split(off_part) + unique_diff = [len(unique_diff[0]), len(unique_diff[1])] + + # Handling of some special mirnas like (hsa-miR-8485) + if mat_diff[1]!=0 and unique_diff[1]!=0: + unique_seq[j]=1 + pre_pos = 0 + post_pos = 0 + + elif mat_diff[0]!=0 and unique_diff[0]!=0: + unique_seq[j]=1 + pre_pos = 0 + post_pos = 0 + + else: + # Keep the findings + pre_pos = mat_diff[0]-unique_diff[0] + post_pos = unique_diff[1]-mat_diff[1] + tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) + tmp_count_seq = tmp_count_seq+1 + + # Store the detected miRNAs with new names according to the findings + if pre_pos != 0 or post_pos != 0: + if pre_pos == 0: + unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+ "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) + elif post_pos == 0: + unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) + else: + unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) + + # Remove the values "1" from the handling of special mirnas (hsa-miR-8485) + for x in range(unique_seq.count(1)): + unique_seq.remove(1) + + # metrics for the production of database + if tmp_count_reads != 0 and tmp_count_seq != 0: + sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) + + # Sorting of the metrics for database + sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) + + # Collapsing of miRNAs and removing of duplicates + collapsed_mirnas = remove_duplicates(unique_seq) + + # Correction of metrics due to the collapsing and removing of duplicates for the production of Database + for y in sorted_uni_arms: + counts=0 + seqs=0 + for x in collapsed_mirnas: + if y[0]==x[2].split("_")[0]: + counts+=int(x[0].split("-")[1]) + seqs+=1 + + y[1]=seqs + y[2]=counts + + + # Output variables + temp_mirna_names=[] + + l.acquire() + + if case == "c" or case == "t": + temp_mirna_names.extend(z[2] for z in collapsed_mirnas) + names_n_seqs.extend([[y[2],y[9]] for y in collapsed_mirnas]) + deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas]) + mirna_names.extend(temp_mirna_names) + unmap_seq.value += sum([1 for x in main_sam if x[1] == '4']) # Keeps the unmap unique sequences for the production of a graph + unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4']) # Keeps the unmap counts of sequences for the production of a graph + file_order.append(file) #Keeps the names of SAM files with the order of reading by the fuction (avoid problems due to multiprocesssing) + samples.append(collapsed_mirnas) # return the processed detected miRNAs + data.append([case,file,collapsed_mirnas,sorted_uni_arms]) + ini_sample.append(filter_sam) # returns the filtered sam file + + l.release() + + +###################################################################################################################################### + + +""" + +Read a sam file from Bowtie and do the followings: + +1) Remove reverse stranded mapped reads +2) Remove unmapped reads +3) Remove all sequences with reads less than 11 reads +4) Sort the arms with the most sequences in decreading rate +5) Sort the sequences of every arm with the most reads in decreasing rate +6) Calculate total number of sequences of every arm +7) Calculate total number of reads of sequences of every arm. +8) Store all the informations in a txt file + +""" + +def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs): + + # read the sam file + ini_sam=read(path,0) + main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] + unique_seq=[] + unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] + + uni_seq=[] + + # Calculate the shifted positions for every non template mirna and add them to the name of it + sorted_uni_arms = [] + for i in range(1,len(mature_mirnas),2): + tmp_count_reads = 0 # calculate the total number of reads + tmp_count_seq = 0 # calculate the total number of sequences + + for j in range(len(unique_seq)): + + temp_mature = mature_mirnas[i].strip().replace("U", "T") + + # Detection of differences between the canonical miRNA and the detected non template miRNA + if temp_mature in unique_seq[j][9]: + + off_part = longestSubstring(temp_mature, unique_seq[j][9]) + + mat_diff = temp_mature.split(off_part) + mat_diff = [len(mat_diff[0]), len(mat_diff[1])] + + unique_diff = unique_seq[j][9].split(off_part) + if len(unique_diff)<=2: + unique_diff = [len(unique_diff[0]), len(unique_diff[1])] + + pre_pos = mat_diff[0]-unique_diff[0] + post_pos = unique_diff[1]-mat_diff[1] + + lengthofmir = len(off_part) + post_pos + if pre_pos == 0 and post_pos<4: + tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) + tmp_count_seq = tmp_count_seq + 1 + + t_name=unique_seq[j].copy() + t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) + uni_seq.append(t_name) + # metrics for the production of database + if tmp_count_reads != 0 and tmp_count_seq != 0: + sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) + + sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) + unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) + + # Output variables + l.acquire() + if case=="c" or case=="t": + names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"]) + n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) + file_order.append(file) + data.append([case,file,unique_seq,sorted_uni_arms]) + l.release() + +################################################################################################################################################################################################################# + +def black_white(mirna_names_1,mirna_names_2,group,manager): + + add_names = [x for x in mirna_names_1 if x not in mirna_names_2] + add_names.sort() + add_names = list(add_names for add_names,_ in itertools.groupby(add_names)) + + group.sort() + group = list(group for group,_ in itertools.groupby(group)) + + zeros=["0"]*(len(group[0])-2) + [add_names[i].extend(zeros) for i,_ in enumerate(add_names)] + group=group+add_names + + manager.extend(group) + +################################################################################################################################################################################################################################ + +def merging_dupes(group,f_dupes): + + dupes=[] + final_mat =[] + + for num,_ in enumerate(group): + + if group[num][1] not in final_mat and group[num][0] not in final_mat: + final_mat.append(group[num][1]) + final_mat.append(group[num][0]) + else: + dupes.append(group[num][1]) + + + dupes=list(set(dupes)) + + dupes=[[x] for x in dupes] + + for x in group: + for y in dupes: + if x[1]==y[0]: + fl=0 + if len(y)==1: + y.append(x[0]) + else: + for i in range(1,len(y)): + if y[i].split("_")[0]==x[0].split("_")[0]: + fl=1 + if len(x[0])<len(y[i]): + del y[i] + y.append(x[0]) + break + + if fl==0: + y.append((x[0])) + + for y in dupes: + if len(y)>2: + for i in range(len(y)-1,1,-1): + y[1]=y[1]+"/"+y[i] + del y[i] + + f_dupes.extend(dupes) + +########################################################################################################################################################################################################################################## + +def apply_merging_dupes(group,dupes,managger): + + for x in group: + for y in dupes: + if x[1]==y[0]: + x[0]=y[1] + + group.sort() + group=list(group for group,_ in itertools.groupby(group)) + managger.extend(group) + +############################################################################################################################################################################################################################### + + +def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts): + + t_group_new=[] + c_group_new=[] + + percent=int(per)/100 + c_col_filter=round(percent*(len(c_group[1])-2)) + t_col_filter=round(percent*(len(t_group[1])-2)) + + for i, _ in enumerate(c_group): + c_cols=0 + t_cols=0 + + c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)]) + t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)]) + + if c_cols>=c_col_filter or t_cols>=t_col_filter: + t_group_new.append(t_group[i]) + c_group_new.append(c_group[i]) + + fil_c_group.extend(c_group_new) + fil_t_group.extend(t_group_new) + +################################################################################################################################################################################################################## + + +def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per): + + if flag == 1 and int(per)!=-1: + fp = open('Counts/Filtered '+group_name2 +' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + for x in fil_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Filtered '+group_name1+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in fil_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 2 and int(per)!=-1: + fp = open('Counts/Filtered '+group_name2+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + + for x in fil_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Filtered '+group_name1+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in fil_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 1: + fp = open('Counts/Raw '+group_name2+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + for x in raw_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Raw '+group_name1+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in raw_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 2: + fp = open('Counts/Raw '+group_name2+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + + for x in raw_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Raw '+group_name1+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in raw_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + +######################################################################################################################################### + +def temp_counts_to_diff(names,samp,folder): + + for i in range(2,len(samp[0])): + + fp = open(folder+names[i-2]+'.txt','w') + fp.write("miRNA id"+"\t"+names[i-2]+"\n") + + for x in samp: + fp.write("%s" % "\t".join([x[0],x[i]])+"\n") + fp.close() + +################################################################################################################## + +def DB_write(con,name,unique_seq,sorted_uni_arms,f): + + if f==1: + # Write a txt file with all the information + if con=="c": + fp = open('split1/'+name, 'w') + + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + if con=="t": + fp = open('split2/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + + + for i in range(len(sorted_uni_arms)): + temp = [] + for j in range(len(unique_seq)): + + if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: + + temp.append(unique_seq[j]) + + temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) + fp.write("*********************************************************************************************************\n") + fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) + fp.write("*********************************************************************************************************\n\n") + [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] + fp.write("\n" + "\n") + fp.close() + + if f==2: + + if con=="c": + fp = open('split3/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + if con=="t": + fp = open('split4/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + + + for i in range(len(sorted_uni_arms)): + temp = [] + for j in range(len(unique_seq)): + if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]: + temp.append(unique_seq[j]) + if temp!=[]: + temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) + fp.write("*********************************************************************************************************\n") + fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) + fp.write("*********************************************************************************************************\n\n") + [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] + fp.write("\n" + "\n") + fp.close() + + +########################################################################################################################## + +def new_mat_seq(pre_unique_seq,mat_mirnas,l): + + unique_iso = [] + for x in pre_unique_seq: + if len(x[2].split("_"))==3: + for y in pre_unique_seq: + if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): + if any(y[2] in lst2 for lst2 in unique_iso)==False: + y[2]=">"+y[2] + unique_iso.append(y) + l.acquire() + for x in unique_iso: + mat_mirnas.append(x[2]) + mat_mirnas.append(x[9]) + l.release() + +######################################################################################################################### + +def merging_names(ini_mat,new): + + dupes=[] + final_mat =[] + + for num in range(len(ini_mat)): + + if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat: + final_mat.append(ini_mat[num][1]) + final_mat.append(ini_mat[num][0]) + else: + dupes.append(ini_mat[num][1]) + + dupes=list(set(dupes)) + + for i in range(len(dupes)): + dupes[i]=[dupes[i]] + + for x in ini_mat: + for y in dupes: + if x[1]==y[0]: + fl=0 + if len(y)==1: + y.append(x[0]) + else: + for i in range(1,len(y)): + if y[i].split("_")[0]==x[0].split("_")[0]: + fl=1 + if len(x[0])<len(y[i]): + del y[i] + y.append(x[0]) + break + + if fl==0: + y.append((x[0])) + + for y in dupes: + if len(y)>2: + for i in range(len(y)-1,1,-1): + y[1]=y[1]+"/"+y[i] + del y[i] + + + for x in ini_mat: + for y in dupes: + if x[1]==y[0]: + x[0]=y[1] + + ini_mat.sort() + ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat)) + + new.extend(ini_mat) + + +###################################################################################################################################################### + +def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder): + + for i in range(2,len(tem_samp[0])): + + fp = open(folder+tem_names[i-2]+'.txt','w') + fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") + + for x in tem_samp: + fp.write("%s" % "\t".join([x[0],x[i]])+"\n") + + for j in range(len(non_names)): + if non_names[j]==tem_names[i-2]: + for x in non_samp: + fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") + fp.close() + +################################################################################################################################################################################################################### + +""" + +This function downloads all the miRNAs of all the species from MirBase +and filters them by the requested organism + +input : Organism +output: A list with the miRNA sequences in fasta format + +""" + +def download_matures(matures,org_name): + + url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' + data = urllib.request.urlopen(url).read() + file_mirna = gzip.decompress(data).decode('utf-8') + file_mirna = file_mirna.split("\n") + + for i in range(0,len(file_mirna)-1,2): + + if org_name in file_mirna[i]: + matures.append(file_mirna[i]) + matures.append(file_mirna[i+1]) + +################################################################################################################################################################################################################### + + +""" + +This function keeps all mirna isoforms which are detected on SAM files from the first part of the analysis +These isoforms will be used as refence sequences with canonical (ref) mirnas for the detection of non-template +mirnas + +""" + + +def non_template_ref(c_samples,t_samples,all_isoforms): + + pre_uni_seq_con = list(c_samples) + pre_uni_seq_tre = list(t_samples) + + for x in pre_uni_seq_con: + for y in x: + #if ">"+y[2] not in all_isoforms and ")_" in y[2] : + if ">"+y[2] not in all_isoforms and "_t_" in y[2] : + all_isoforms.append(">"+y[2]) + all_isoforms.append(y[9]) + + for x in pre_uni_seq_tre: + for y in x: + #if ">"+y[2] not in all_isoforms and ")_" in y[2]: + if ">"+y[2] not in all_isoforms and "_t_" in y[2] : + all_isoforms.append(">"+y[2]) + all_isoforms.append(y[9]) + +################################################################################################################################################################################################ + +""" + +This function adds the uncommon detected miRNAs among samples. +As a result all samples will have the same length. + +""" + +def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order): + + for y in mir_names: + flag=0 + for x in sample: + if y[0]==x[0]: # check if miRNA exists in the sample + flag=1 + break + if flag==0: + sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence + + # sorting and remove duplicates + sample.sort(key=lambda x: x[0]) + sample=list(sample for sample,_ in itertools.groupby(sample)) + + # Return the updated sample + l.acquire() + new_d.append(sample) + sample_order.append(sample_name) + l.release() + +############################################################################################################################################################################################### +