Mercurial > repos > glogobyte > isoread
view mirbase_functions.py @ 8:de8a08bd4593 draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 13 Oct 2021 16:06:27 +0000 |
parents | 47232a73a46b |
children | 77ba8dde6fb7 |
line wrap: on
line source
import itertools import re import urllib.request import gzip import copy from collections import OrderedDict # Read a file and return it as a list def read(path, flag): if flag == 0: with open(path) as fp: file=fp.readlines() fp.close() return file if flag == 1: with open(path) as fp: file = fp.read().splitlines() fp.close() return file # Write a list to a txt file def write(path, list): with open(path,'w') as fp: for x in list: fp.write(str("\t".join(x[1:-1]))) fp.close() #################################################################################################################> # Detect the longest common substring sequence between two mirnas def longestSubstring(str1, str2): from difflib import SequenceMatcher # initialize SequenceMatcher object with # input string seqMatch = SequenceMatcher(None, str1, str2) # find match of longest sub-string # output will be like Match(a=0, b=0, size=5) match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) # print longest substring if (match.size != 0): return str1[match.a: match.a + match.size] else: print('No longest common sub-string found') ################################################################################################################################################################################################################# """ This function concatenates miRNAs which are generated from different chromosomes and eliminates the duplications of miRNAs on every sample input: detected miRNAs output: collpased miRNAs without duplicates """ def remove_duplicates(mirnas): # Detection of canonical mirRNAs whicha are generated from different chromosomes dupes=[[x[9],x[0],x[2]] for x in mirnas] for x in mirnas: for y in dupes: if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]: y.append(x[2]) # Detection of different chromosomes for every miRNA chr_order = [] for x in dupes: temp = [] for i in range(2,len(x)): if x[i].split("chr")[1].split("(")[0].isdigit(): temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) else: temp.append(x[i].split("chr")[1][0:4]) for z in temp: if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: temp = [str(j) for j in temp] temp = list(set(temp)) temp.sort() chr_order.append(temp) # Collapsing the miRNAs with the same sequence from different chromosomes collapsed_dupes=[] for i in range(len(dupes)): collapsed_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) for x in chr_order[i]: chr_check = re.match("[-+]?\d+$", str(x)) # check if chromosome is 'X' or 'Y' if chr_check is not None: if int(x)<0: # Check the strand (+) or (-) collapsed_dupes[i][1]= collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" else: collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" else: collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(x) # Remove duplicates from collapsed_dupes collapsed_dupes.sort() collapsed_dupes = list(collapsed_dupes for collapsed_dupes,_ in itertools.groupby(collapsed_dupes)) for i in range(len(mirnas)): for x in collapsed_dupes: # Naming of template isomirs (adding positions in the names) if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) >3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: gg=str("_t_"+mirnas[i][2].split("_")[-2]+"_"+mirnas[i][2].split("_")[-1]) mirnas[i][2] = x[1]+gg break # Naming of canonical miRNAs (collpsed names) if mirnas[i][9]==x[0] and mirnas[i][0]== x[2] and len(mirnas[i][2].split("_"))==3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]: mirnas[i][2] = x[1] break # Remove duplicates mirnas.sort() mirnas=list(mirnas for mirnas,_ in itertools.groupby(mirnas)) return mirnas ############################################################################################################################################################################################################# """ This function indentifies and classifies the miRNAs which are detected from the alignment tool. """ def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts): # read the sam file ini_sam=read(path,0) main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] # remove introduction unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] # keeps only the functional miRNAs filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam] # keeps only the necessary info of miRNAs from sam files (name, sequence, counts, etc) sorted_uni_arms = [] for i in range(0,len(mature_mirnas,),2): tmp_count_reads = 0 # calculate the total number of reads tmp_count_seq = 0 # calculate the total number of sequences for j in range(len(unique_seq)): if "{" in unique_seq[j][2].split("_")[0]: # checks if a miRNA is generated from two different locis on the same chromosome mirna=unique_seq[j][2].split("_")[0][:-4] else: mirna=unique_seq[j][2].split("_")[0] # Detection of differences between the canonical miRNA and the detected miRNA if mature_mirnas[i].split(" ")[0][1:] == mirna: temp_mature = mature_mirnas[i+1].strip().replace("U", "T") off_part = longestSubstring(temp_mature, unique_seq[j][9]) mat_diff = temp_mature.split(off_part) mat_diff = [len(mat_diff[0]), len(mat_diff[1])] unique_diff = unique_seq[j][9].split(off_part) unique_diff = [len(unique_diff[0]), len(unique_diff[1])] # Handling of some special mirnas like (hsa-miR-8485) if mat_diff[1]!=0 and unique_diff[1]!=0: unique_seq[j]=1 pre_pos = 0 post_pos = 0 elif mat_diff[0]!=0 and unique_diff[0]!=0: unique_seq[j]=1 pre_pos = 0 post_pos = 0 else: # Keep the findings pre_pos = mat_diff[0]-unique_diff[0] post_pos = unique_diff[1]-mat_diff[1] tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) tmp_count_seq = tmp_count_seq+1 # Store the detected miRNAs with new names according to the findings if pre_pos != 0 or post_pos != 0: if pre_pos == 0: unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+ "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) elif post_pos == 0: unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) else: unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) # Remove the values "1" from the handling of special mirnas (hsa-miR-8485) for x in range(unique_seq.count(1)): unique_seq.remove(1) # metrics for the production of database if tmp_count_reads != 0 and tmp_count_seq != 0: sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) # Sorting of the metrics for database sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) # Collapsing of miRNAs and removing of duplicates collapsed_mirnas = remove_duplicates(unique_seq) # Correction of metrics due to the collapsing and removing of duplicates for the production of Database for y in sorted_uni_arms: counts=0 seqs=0 for x in collapsed_mirnas: if y[0]==x[2].split("_")[0]: counts+=int(x[0].split("-")[1]) seqs+=1 y[1]=seqs y[2]=counts # Output variables temp_mirna_names=[] l.acquire() if case == "c" or case == "t": temp_mirna_names.extend(z[2] for z in collapsed_mirnas) names_n_seqs.extend([[y[2],y[9]] for y in collapsed_mirnas]) deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas]) mirna_names.extend(temp_mirna_names) unmap_seq.value += sum([1 for x in main_sam if x[1] == '4']) # Keeps the unmap unique sequences for the production of a graph unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4']) # Keeps the unmap counts of sequences for the production of a graph file_order.append(file) #Keeps the names of SAM files with the order of reading by the fuction (avoid problems due to multiprocesssing) samples.append(collapsed_mirnas) # return the processed detected miRNAs data.append([case,file,collapsed_mirnas,sorted_uni_arms]) ini_sample.append(filter_sam) # returns the filtered sam file l.release() ###################################################################################################################################### """ Read a sam file from Bowtie and do the followings: 1) Remove reverse stranded mapped reads 2) Remove unmapped reads 3) Remove all sequences with reads less than 11 reads 4) Sort the arms with the most sequences in decreading rate 5) Sort the sequences of every arm with the most reads in decreasing rate 6) Calculate total number of sequences of every arm 7) Calculate total number of reads of sequences of every arm. 8) Store all the informations in a txt file """ def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs): # read the sam file ini_sam=read(path,0) main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] unique_seq=[] unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] uni_seq=[] # Calculate the shifted positions for every non template mirna and add them to the name of it sorted_uni_arms = [] for i in range(1,len(mature_mirnas),2): tmp_count_reads = 0 # calculate the total number of reads tmp_count_seq = 0 # calculate the total number of sequences for j in range(len(unique_seq)): temp_mature = mature_mirnas[i].strip().replace("U", "T") # Detection of differences between the canonical miRNA and the detected non template miRNA if temp_mature in unique_seq[j][9]: off_part = longestSubstring(temp_mature, unique_seq[j][9]) mat_diff = temp_mature.split(off_part) mat_diff = [len(mat_diff[0]), len(mat_diff[1])] unique_diff = unique_seq[j][9].split(off_part) if len(unique_diff)<=2: unique_diff = [len(unique_diff[0]), len(unique_diff[1])] pre_pos = mat_diff[0]-unique_diff[0] post_pos = unique_diff[1]-mat_diff[1] lengthofmir = len(off_part) + post_pos if pre_pos == 0 and post_pos<4: tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) tmp_count_seq = tmp_count_seq + 1 t_name=unique_seq[j].copy() t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) uni_seq.append(t_name) # metrics for the production of database if tmp_count_reads != 0 and tmp_count_seq != 0: sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) # Output variables l.acquire() if case=="c" or case=="t": names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"]) n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) file_order.append(file) data.append([case,file,unique_seq,sorted_uni_arms]) l.release() ################################################################################################################################################################################################################# def black_white(mirna_names_1,mirna_names_2,group,manager): add_names = [x for x in mirna_names_1 if x not in mirna_names_2] add_names.sort() add_names = list(add_names for add_names,_ in itertools.groupby(add_names)) group.sort() group = list(group for group,_ in itertools.groupby(group)) zeros=["0"]*(len(group[0])-2) [add_names[i].extend(zeros) for i,_ in enumerate(add_names)] group=group+add_names manager.extend(group) ################################################################################################################################################################################################################################ def merging_dupes(group,f_dupes): dupes=[] final_mat =[] for num,_ in enumerate(group): if group[num][1] not in final_mat and group[num][0] not in final_mat: final_mat.append(group[num][1]) final_mat.append(group[num][0]) else: dupes.append(group[num][1]) dupes=list(set(dupes)) dupes=[[x] for x in dupes] for x in group: for y in dupes: if x[1]==y[0]: fl=0 if len(y)==1: y.append(x[0]) else: for i in range(1,len(y)): if y[i].split("_")[0]==x[0].split("_")[0]: fl=1 if len(x[0])<len(y[i]): del y[i] y.append(x[0]) break if fl==0: y.append((x[0])) for y in dupes: if len(y)>2: for i in range(len(y)-1,1,-1): y[1]=y[1]+"/"+y[i] del y[i] f_dupes.extend(dupes) ########################################################################################################################################################################################################################################## def apply_merging_dupes(group,dupes,managger): for x in group: for y in dupes: if x[1]==y[0]: x[0]=y[1] group.sort() group=list(group for group,_ in itertools.groupby(group)) managger.extend(group) ############################################################################################################################################################################################################################### def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts): t_group_new=[] c_group_new=[] percent=int(per)/100 c_col_filter=round(percent*(len(c_group[1])-2)) t_col_filter=round(percent*(len(t_group[1])-2)) for i, _ in enumerate(c_group): c_cols=0 t_cols=0 c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)]) t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)]) if c_cols>=c_col_filter or t_cols>=t_col_filter: t_group_new.append(t_group[i]) c_group_new.append(c_group[i]) fil_c_group.extend(c_group_new) fil_t_group.extend(t_group_new) ################################################################################################################################################################################################################## def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per): if flag == 1 and int(per)!=-1: fp = open('Counts/Filtered '+group_name2 +' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in fil_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Filtered '+group_name1+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in fil_con: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 2 and int(per)!=-1: fp = open('Counts/Filtered '+group_name2+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in fil_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Filtered '+group_name1+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in fil_con: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 1: fp = open('Counts/Raw '+group_name2+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in raw_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Raw '+group_name1+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in raw_con: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 2: fp = open('Counts/Raw '+group_name2+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in raw_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Raw '+group_name1+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in raw_con: fp.write("\n%s" % "\t".join(x)) fp.close() ######################################################################################################################################### def temp_counts_to_diff(names,samp,folder): for i in range(2,len(samp[0])): fp = open(folder+names[i-2]+'.txt','w') fp.write("miRNA id"+"\t"+names[i-2]+"\n") for x in samp: fp.write("%s" % "\t".join([x[0],x[i]])+"\n") fp.close() ################################################################################################################## def DB_write(con,name,unique_seq,sorted_uni_arms,f): if f==1: # Write a txt file with all the information if con=="c": fp = open('split1/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) if con=="t": fp = open('split2/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) for i in range(len(sorted_uni_arms)): temp = [] for j in range(len(unique_seq)): if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: temp.append(unique_seq[j]) temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) fp.write("*********************************************************************************************************\n") fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) fp.write("*********************************************************************************************************\n\n") [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] fp.write("\n" + "\n") fp.close() if f==2: if con=="c": fp = open('split3/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) if con=="t": fp = open('split4/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) for i in range(len(sorted_uni_arms)): temp = [] for j in range(len(unique_seq)): if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]: temp.append(unique_seq[j]) if temp!=[]: temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) fp.write("*********************************************************************************************************\n") fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) fp.write("*********************************************************************************************************\n\n") [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] fp.write("\n" + "\n") fp.close() ########################################################################################################################## def new_mat_seq(pre_unique_seq,mat_mirnas,l): unique_iso = [] for x in pre_unique_seq: if len(x[2].split("_"))==3: for y in pre_unique_seq: if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): if any(y[2] in lst2 for lst2 in unique_iso)==False: y[2]=">"+y[2] unique_iso.append(y) l.acquire() for x in unique_iso: mat_mirnas.append(x[2]) mat_mirnas.append(x[9]) l.release() ######################################################################################################################### def merging_names(ini_mat,new): dupes=[] final_mat =[] for num in range(len(ini_mat)): if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat: final_mat.append(ini_mat[num][1]) final_mat.append(ini_mat[num][0]) else: dupes.append(ini_mat[num][1]) dupes=list(set(dupes)) for i in range(len(dupes)): dupes[i]=[dupes[i]] for x in ini_mat: for y in dupes: if x[1]==y[0]: fl=0 if len(y)==1: y.append(x[0]) else: for i in range(1,len(y)): if y[i].split("_")[0]==x[0].split("_")[0]: fl=1 if len(x[0])<len(y[i]): del y[i] y.append(x[0]) break if fl==0: y.append((x[0])) for y in dupes: if len(y)>2: for i in range(len(y)-1,1,-1): y[1]=y[1]+"/"+y[i] del y[i] for x in ini_mat: for y in dupes: if x[1]==y[0]: x[0]=y[1] ini_mat.sort() ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat)) new.extend(ini_mat) ###################################################################################################################################################### def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder): for i in range(2,len(tem_samp[0])): fp = open(folder+tem_names[i-2]+'.txt','w') fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") for x in tem_samp: fp.write("%s" % "\t".join([x[0],x[i]])+"\n") for j in range(len(non_names)): if non_names[j]==tem_names[i-2]: for x in non_samp: fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") fp.close() ################################################################################################################################################################################################################### """ This function downloads all the miRNAs of all the species from MirBase and filters them by the requested organism input : Organism output: A list with the miRNA sequences in fasta format """ def download_matures(matures,org_name): url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' data = urllib.request.urlopen(url).read() file_mirna = gzip.decompress(data).decode('utf-8') file_mirna = file_mirna.split("\n") for i in range(0,len(file_mirna)-1,2): if org_name in file_mirna[i]: matures.append(file_mirna[i]) matures.append(file_mirna[i+1]) ################################################################################################################################################################################################################### """ This function keeps all mirna isoforms which are detected on SAM files from the first part of the analysis These isoforms will be used as refence sequences with canonical (ref) mirnas for the detection of non-template mirnas """ def non_template_ref(c_samples,t_samples,all_isoforms): pre_uni_seq_con = list(c_samples) pre_uni_seq_tre = list(t_samples) for x in pre_uni_seq_con: for y in x: #if ">"+y[2] not in all_isoforms and ")_" in y[2] : if ">"+y[2] not in all_isoforms and "_t_" in y[2] : all_isoforms.append(">"+y[2]) all_isoforms.append(y[9]) for x in pre_uni_seq_tre: for y in x: #if ">"+y[2] not in all_isoforms and ")_" in y[2]: if ">"+y[2] not in all_isoforms and "_t_" in y[2] : all_isoforms.append(">"+y[2]) all_isoforms.append(y[9]) ################################################################################################################################################################################################ """ This function adds the uncommon detected miRNAs among samples. As a result all samples will have the same length. """ def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order): for y in mir_names: flag=0 for x in sample: if y[0]==x[0]: # check if miRNA exists in the sample flag=1 break if flag==0: sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence # sorting and remove duplicates sample.sort(key=lambda x: x[0]) sample=list(sample for sample,_ in itertools.groupby(sample)) # Return the updated sample l.acquire() new_d.append(sample) sample_order.append(sample_name) l.release() ###############################################################################################################################################################################################