Mercurial > repos > glogobyte > isoread
changeset 36:e55fa008e518 draft
Deleted selected files
author | glogobyte |
---|---|
date | Tue, 17 Oct 2023 08:21:45 +0000 |
parents | 293aa8cbbc20 |
children | b8de171556f0 |
files | mirbase_functions.py |
diffstat | 1 files changed, 0 insertions(+), 766 deletions(-) [+] |
line wrap: on
line diff
--- a/mirbase_functions.py Mon Apr 25 07:34:04 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,766 +0,0 @@ -import itertools -import re -import urllib.request -import gzip -import copy -from collections import OrderedDict - - - -# Read a file and return it as a list -def read(path, flag): - if flag == 0: - with open(path) as fp: - file=fp.readlines() - fp.close() - return file - - if flag == 1: - with open(path) as fp: - file = fp.read().splitlines() - fp.close() - return file - -# Write a list to a txt file -def write(path, list): - with open(path,'w') as fp: - for x in list: - fp.write(str("\t".join(x[1:-1]))) - fp.close() - - -#################################################################################################################> - -# Detect the longest common substring sequence between two mirnas -def longestSubstring(str1, str2): - - from difflib import SequenceMatcher - # initialize SequenceMatcher object with - # input string - seqMatch = SequenceMatcher(None, str1, str2) - - # find match of longest sub-string - # output will be like Match(a=0, b=0, size=5) - match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) - - # print longest substring - if (match.size != 0): - return str1[match.a: match.a + match.size] - else: - print('No longest common sub-string found') - -################################################################################################################################################################################################################# - -""" - -This function concatenates miRNAs which are generated from different chromosomes -and eliminates the duplications of miRNAs on every sample - -input: detected miRNAs -output: collpased miRNAs without duplicates - -""" - - -def remove_duplicates(mirnas): - - # Detection of canonical mirRNAs whicha are generated from different chromosomes - dupes=[[x[9],x[0],x[2]] for x in mirnas] - - for x in mirnas: - for y in dupes: - if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]: - y.append(x[2]) - - # Detection of different chromosomes for every miRNA - chr_order = [] - for x in dupes: - temp = [] - for i in range(2,len(x)): - if x[i].split("chr")[1].split("(")[0].isdigit(): - temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) - else: - temp.append(x[i].split("chr")[1][0:4]) - - for z in temp: - if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: - temp = [str(j) for j in temp] - temp = list(set(temp)) - temp.sort() - chr_order.append(temp) - - # Collapsing the miRNAs with the same sequence from different chromosomes - collapsed_dupes=[] - for i in range(len(dupes)): - collapsed_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) - for x in chr_order[i]: - chr_check = re.match("[-+]?\d+$", str(x)) # check if chromosome is 'X' or 'Y' - if chr_check is not None: - if int(x)<0: # Check the strand (+) or (-) - collapsed_dupes[i][1]= collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" - else: - collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" - else: - collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(x) - - # Remove duplicates from collapsed_dupes - collapsed_dupes.sort() - collapsed_dupes = list(collapsed_dupes for collapsed_dupes,_ in itertools.groupby(collapsed_dupes)) - - for i in range(len(mirnas)): - for x in collapsed_dupes: - - # Naming of template isomirs (adding positions in the names) - if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) >3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: - gg=str("_t_"+mirnas[i][2].split("_")[-2]+"_"+mirnas[i][2].split("_")[-1]) - mirnas[i][2] = x[1]+gg - break - - # Naming of canonical miRNAs (collpsed names) - if mirnas[i][9]==x[0] and mirnas[i][0]== x[2] and len(mirnas[i][2].split("_"))==3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: - mirnas[i][2] = x[1] - break - - # Remove duplicates - mirnas.sort() - mirnas=list(mirnas for mirnas,_ in itertools.groupby(mirnas)) - - return mirnas - -############################################################################################################################################################################################################# - -""" - -This function indentifies and classifies the miRNAs which are detected from the alignment tool. - -""" - -def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts): - - # read the sam file - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] # remove introduction - unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] # keeps only the functional miRNAs - filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam] # keeps only the necessary info of miRNAs from sam files (name, sequence, counts, etc) - - sorted_uni_arms = [] - - for i in range(0,len(mature_mirnas,),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - for j in range(len(unique_seq)): - - if "{" in unique_seq[j][2].split("_")[0]: # checks if a miRNA is generated from two different locis on the same chromosome - mirna=unique_seq[j][2].split("_")[0][:-4] - else: - mirna=unique_seq[j][2].split("_")[0] - - # Detection of differences between the canonical miRNA and the detected miRNA - if mature_mirnas[i].split(" ")[0][1:] == mirna: - - temp_mature = mature_mirnas[i+1].strip().replace("U", "T") - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - # Handling of some special mirnas like (hsa-miR-8485) - if mat_diff[1]!=0 and unique_diff[1]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - elif mat_diff[0]!=0 and unique_diff[0]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - else: - # Keep the findings - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq+1 - - # Store the detected miRNAs with new names according to the findings - if pre_pos != 0 or post_pos != 0: - if pre_pos == 0: - unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+ "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) - elif post_pos == 0: - unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) - else: - unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) - - # Remove the values "1" from the handling of special mirnas (hsa-miR-8485) - for x in range(unique_seq.count(1)): - unique_seq.remove(1) - - # metrics for the production of database - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - - # Sorting of the metrics for database - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - - # Collapsing of miRNAs and removing of duplicates - collapsed_mirnas = remove_duplicates(unique_seq) - - # Correction of metrics due to the collapsing and removing of duplicates for the production of Database - for y in sorted_uni_arms: - counts=0 - seqs=0 - for x in collapsed_mirnas: - if y[0] in x[2].split("_")[0]: - counts+=int(x[0].split("-")[1]) - seqs+=1 - - y[1]=seqs - y[2]=counts - - - # Output variables - temp_mirna_names=[] - - l.acquire() - - if case == "c" or case == "t": - temp_mirna_names.extend(z[2] for z in collapsed_mirnas) - names_n_seqs.extend([[y[2],y[9]] for y in collapsed_mirnas]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas]) - mirna_names.extend(temp_mirna_names) - unmap_seq.value += sum([1 for x in main_sam if x[1] == '4']) # Keeps the unmap unique sequences for the production of a graph - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4']) # Keeps the unmap counts of sequences for the production of a graph - file_order.append(file) #Keeps the names of SAM files with the order of reading by the fuction (avoid problems due to multiprocesssing) - samples.append(collapsed_mirnas) # return the processed detected miRNAs - data.append([case,file,collapsed_mirnas,sorted_uni_arms]) - ini_sample.append(filter_sam) # returns the filtered sam file - - l.release() - - -###################################################################################################################################### - - -""" - -Read a sam file from Bowtie and do the followings: - -1) Remove reverse stranded mapped reads -2) Remove unmapped reads -3) Remove all sequences with reads less than 11 reads -4) Sort the arms with the most sequences in decreading rate -5) Sort the sequences of every arm with the most reads in decreasing rate -6) Calculate total number of sequences of every arm -7) Calculate total number of reads of sequences of every arm. -8) Store all the informations in a txt file - -""" - -def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs): - - # read the sam file - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq=[] - unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] - - uni_seq=[] - - # Calculate the shifted positions for every non template mirna and add them to the name of it - sorted_uni_arms = [] - for i in range(1,len(mature_mirnas),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - - for j in range(len(unique_seq)): - - temp_mature = mature_mirnas[i].strip().replace("U", "T") - - # Detection of differences between the canonical miRNA and the detected non template miRNA - if temp_mature in unique_seq[j][9]: - - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - if len(unique_diff)<=2: - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - - lengthofmir = len(off_part) + post_pos - if pre_pos == 0 and post_pos<4: - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq + 1 - - t_name=unique_seq[j].copy() - t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) - uni_seq.append(t_name) - # metrics for the production of database - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) - - # Output variables - l.acquire() - if case=="c" or case=="t": - names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - file_order.append(file) - data.append([case,file,unique_seq,sorted_uni_arms]) - l.release() - -################################################################################################################################################################################################################# - -def black_white(mirna_names_1,mirna_names_2,group,manager): - - add_names = [x for x in mirna_names_1 if x not in mirna_names_2] - add_names.sort() - add_names = list(add_names for add_names,_ in itertools.groupby(add_names)) - - group.sort() - group = list(group for group,_ in itertools.groupby(group)) - - zeros=["0"]*(len(group[0])-2) - [add_names[i].extend(zeros) for i,_ in enumerate(add_names)] - group=group+add_names - - manager.extend(group) - -################################################################################################################################################################################################################################ - -def merging_dupes(group,f_dupes): - - dupes=[] - final_mat =[] - - for num,_ in enumerate(group): - - if group[num][1] not in final_mat and group[num][0] not in final_mat: - final_mat.append(group[num][1]) - final_mat.append(group[num][0]) - else: - dupes.append(group[num][1]) - - - dupes=list(set(dupes)) - - dupes=[[x] for x in dupes] - - for x in group: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - f_dupes.extend(dupes) - -########################################################################################################################################################################################################################################## - -def apply_merging_dupes(group,dupes,managger): - - for x in group: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - group.sort() - group=list(group for group,_ in itertools.groupby(group)) - managger.extend(group) - -############################################################################################################################################################################################################################### - - -def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts): - - t_group_new=[] - c_group_new=[] - - percent=int(per)/100 - c_col_filter=round(percent*(len(c_group[1])-2)) - t_col_filter=round(percent*(len(t_group[1])-2)) - - for i, _ in enumerate(c_group): - c_cols=0 - t_cols=0 - - c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)]) - t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)]) - - if c_cols>=c_col_filter or t_cols>=t_col_filter: - t_group_new.append(t_group[i]) - c_group_new.append(c_group[i]) - - fil_c_group.extend(c_group_new) - fil_t_group.extend(t_group_new) - -################################################################################################################################################################################################################## - - -def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per): - - if flag == 1 and int(per)!=-1: - fp = open('Counts/Filtered '+group_name2 +' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in tre_file_order: - fp.write("\t"+y) - - for x in fil_tre: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+group_name1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in con_file_order: - fp.write("\t"+y) - - for x in fil_con: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2 and int(per)!=-1: - fp = open('Counts/Filtered '+group_name2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in tre_file_order: - fp.write("\t"+y) - - - for x in fil_tre: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+group_name1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in con_file_order: - fp.write("\t"+y) - - for x in fil_con: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 1: - fp = open('Counts/Raw '+group_name2+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in tre_file_order: - fp.write("\t"+y) - - for x in raw_tre: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+group_name1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in con_file_order: - fp.write("\t"+y) - - for x in raw_con: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2: - fp = open('Counts/Raw '+group_name2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in tre_file_order: - fp.write("\t"+y) - - - for x in raw_tre: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+group_name1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in con_file_order: - fp.write("\t"+y) - - for x in raw_con: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - -######################################################################################################################################### - -def temp_counts_to_diff(names,samp,folder): - - for i in range(2,len(samp[0])): - - fp = open(folder+names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+names[i-2]+"\n") - - for x in samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - fp.close() - -################################################################################################################## - -def DB_write(con,name,unique_seq,sorted_uni_arms,f): - - if f==1: - # Write a txt file with all the information - if con=="c": - fp = open('split1/'+name, 'w') - - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split2/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - - if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: - - temp.append(unique_seq[j]) - - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - if f==2: - - if con=="c": - fp = open('split3/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split4/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]: - temp.append(unique_seq[j]) - if temp!=[]: - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - -########################################################################################################################## - -def new_mat_seq(pre_unique_seq,mat_mirnas,l): - - unique_iso = [] - for x in pre_unique_seq: - if len(x[2].split("_"))==3: - for y in pre_unique_seq: - if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): - if any(y[2] in lst2 for lst2 in unique_iso)==False: - y[2]=">"+y[2] - unique_iso.append(y) - l.acquire() - for x in unique_iso: - mat_mirnas.append(x[2]) - mat_mirnas.append(x[9]) - l.release() - -######################################################################################################################### - -def merging_names(ini_mat,new): - - dupes=[] - final_mat =[] - - for num in range(len(ini_mat)): - - if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat: - final_mat.append(ini_mat[num][1]) - final_mat.append(ini_mat[num][0]) - else: - dupes.append(ini_mat[num][1]) - - dupes=list(set(dupes)) - - for i in range(len(dupes)): - dupes[i]=[dupes[i]] - - for x in ini_mat: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in ini_mat: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - ini_mat.sort() - ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat)) - - new.extend(ini_mat) - - -###################################################################################################################################################### - -def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder): - - for i in range(2,len(tem_samp[0])): - - fp = open(folder+tem_names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") - - for x in tem_samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - - for j in range(len(non_names)): - if non_names[j]==tem_names[i-2]: - for x in non_samp: - fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") - fp.close() - -################################################################################################################################################################################################################### - -""" - -This function downloads all the miRNAs of all the species from MirBase -and filters them by the requested organism - -input : Organism -output: A list with the miRNA sequences in fasta format - -""" - -def download_matures(matures,org_name): - - url = 'https://www.mirbase.org/ftp/CURRENT/mature.fa.gz' - data = urllib.request.urlopen(url).read() - file_mirna = gzip.decompress(data).decode('utf-8') - file_mirna = file_mirna.split("\n") - - for i in range(0,len(file_mirna)-1,2): - - if org_name in file_mirna[i]: - matures.append(file_mirna[i]) - matures.append(file_mirna[i+1]) - -################################################################################################################################################################################################################### - - -""" - -This function keeps all mirna isoforms which are detected on SAM files from the first part of the analysis -These isoforms will be used as refence sequences with canonical (ref) mirnas for the detection of non-template -mirnas - -""" - - -def non_template_ref(c_samples,t_samples,all_isoforms): - - pre_uni_seq_con = list(c_samples) - pre_uni_seq_tre = list(t_samples) - - for x in pre_uni_seq_con: - for y in x: - #if ">"+y[2] not in all_isoforms and ")_" in y[2] : - if ">"+y[2] not in all_isoforms and "_t_" in y[2] : - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - - for x in pre_uni_seq_tre: - for y in x: - #if ">"+y[2] not in all_isoforms and ")_" in y[2]: - if ">"+y[2] not in all_isoforms and "_t_" in y[2] : - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - -################################################################################################################################################################################################ - -""" - -This function adds the uncommon detected miRNAs among samples. -As a result all samples will have the same length. - -""" - -def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order): - - for y in mir_names: - flag=0 - for x in sample: - if y[0]==x[0]: # check if miRNA exists in the sample - flag=1 - break - if flag==0: - sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence - - # sorting and remove duplicates - sample.sort(key=lambda x: x[0]) - sample=list(sample for sample,_ in itertools.groupby(sample)) - - # Return the updated sample - l.acquire() - new_d.append(sample) - sample_order.append(sample_name) - l.release() - -############################################################################################################################################################################################### -