Mercurial > repos > glogobyte > isoread
view mirgene_functions.py @ 22:958330b9bce3 draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 20 Oct 2021 12:11:30 +0000 |
parents | 810e789ffeab |
children |
line wrap: on
line source
import itertools import urllib.request from collections import OrderedDict import copy ######################################################################################################################################################## """ Read a file and return it as a list """ def read(path, flag): if flag == 0: with open(path) as fp: file=fp.readlines() fp.close() return file if flag == 1: with open(path) as fp: file = fp.read().splitlines() fp.close() return file # Write a list to a txt file def write(path, list): with open(path,'w') as fp: for x in list: fp.write(str("\t".join(x[1:-1]))) fp.close() ######################################################################################################################################################## """ Detect the longest common substring sequence between two mirnas """ def longestSubstring(str1, str2): from difflib import SequenceMatcher # initialize SequenceMatcher object with # input string seqMatch = SequenceMatcher(None, str1, str2) # find match of longest sub-string # output will be like Match(a=0, b=0, size=5) match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) # print longest substring if (match.size != 0): return str1[match.a: match.a + match.size] else: print('No longest common sub-string found') ################################################################################################################################################################# """ Read the sam files from alignment tool and do the followings: 1) Keep mapped reads 2) Keep all sequences with length between 18 and 26 nucleotides 3) Detects the ref and templated miRNAs 4) Gives names to templated miRNAs based on ref miRNAs """ def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts): # read the sam file ini_sam=read(path,0) main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam] sorted_uni_arms = [] # Detection of differences between the canonical miRNA and the detected miRNA for i in range(len(mature_mirnas)): tmp_count_reads = 0 # calculate the total number of reads tmp_count_seq = 0 # calculate the total number of sequences for j in range(len(unique_seq)): if mature_mirnas[i] == unique_seq[j][2]: temp_mature = mature_mirnas[i+1] off_part = longestSubstring(temp_mature, unique_seq[j][9]) mat_diff = temp_mature.split(off_part) mat_diff = [len(mat_diff[0]), len(mat_diff[1])] unique_diff = unique_seq[j][9].split(off_part) unique_diff = [len(unique_diff[0]), len(unique_diff[1])] # Handling of some special mirnas like (hsa-miR-8485) if mat_diff[1]!=0 and unique_diff[1]!=0: unique_seq[j]=1 pre_pos = 0 post_pos = 0 elif mat_diff[0]!=0 and unique_diff[0]!=0: unique_seq[j]=1 pre_pos = 0 post_pos = 0 else: # Keep the findings pre_pos = mat_diff[0]-unique_diff[0] post_pos = unique_diff[1]-mat_diff[1] tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) tmp_count_seq = tmp_count_seq+1 # Store the detected miRNAs with new names according to the findings if pre_pos != 0 or post_pos != 0: if pre_pos == 0: unique_seq[j][2] = unique_seq[j][2] + "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) elif post_pos == 0: unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) else: unique_seq[j][2] = unique_seq[j][2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) # Remove the values "1" from the handling of special mirnas (hsa-miR-8485) for x in range(unique_seq.count(1)): unique_seq.remove(1) # metrics for the production of database if tmp_count_reads != 0 and tmp_count_seq != 0: sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) # Sorting of the metrics for database sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) # Correction of metrics due to the collapsing and removing of duplicates for the production of Database for y in sorted_uni_arms: counts=0 seqs=0 for x in unique_seq: if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: counts+=int(x[0].split("-")[1]) seqs+=1 y[1]=seqs y[2]=counts # Output variables temp_mirna_names=[] l.acquire() if case == "c" or case == "t": temp_mirna_names.extend(z[2] for z in unique_seq) names_n_seqs.extend([[y[2],y[9]] for y in unique_seq]) deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) mirna_names.extend(temp_mirna_names) unmap_seq.value += sum([1 for x in main_sam if x[1] == '4']) unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4']) file_order.append(file) samples.append(unique_seq) data.append([case,file,unique_seq,sorted_uni_arms]) ini_sample.append(filter_sam) l.release() ###################################################################################################################################### """ Read a sam file from Bowtie and do the followings: 1) Keep unmapped reads 2) Keep all sequences with length between 18 and 26 nucleotides 3) Detects the non-template isomirs 4) Gives names to isomir's based on ref miRNAs """ def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs): # read the sam file ini_sam=read(path,0) main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] unique_seq=[] unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] uni_seq=[] # Calculate the shifted positions for every isomir and add them to the name of it sorted_uni_arms = [] for i in range(1,len(mature_mirnas),2): tmp_count_reads = 0 # calculate the total number of reads tmp_count_seq = 0 # calculate the total number of sequences for j in range(len(unique_seq)): temp_mature = mature_mirnas[i].strip().replace("U", "T") # Detection of differences between the canonical miRNA and the detected non template miRNA if temp_mature in unique_seq[j][9]: off_part = longestSubstring(temp_mature, unique_seq[j][9]) mat_diff = temp_mature.split(off_part) mat_diff = [len(mat_diff[0]), len(mat_diff[1])] unique_diff = unique_seq[j][9].split(off_part) if len(unique_diff)<=2: unique_diff = [len(unique_diff[0]), len(unique_diff[1])] pre_pos = mat_diff[0]-unique_diff[0] post_pos = unique_diff[1]-mat_diff[1] lengthofmir = len(off_part) + post_pos if pre_pos == 0 and post_pos<4: tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) tmp_count_seq = tmp_count_seq + 1 t_name=copy.deepcopy(unique_seq[j]) t_name[2]=mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) uni_seq.append(t_name) # metrics for the production of database if tmp_count_reads != 0 and tmp_count_seq != 0: sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) # Output variables l.acquire() if case == "c" or case == "t": names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"]) n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) file_order.append(file) data.append([case,file,unique_seq,sorted_uni_arms]) l.release() ################################################################################################################################################################################################################# """ This function detects the differences between the two groups (control, treated). Then copy the undetected miRNAs from one group to other and add zeros as counts. With this way the two groups will have the same number of miRNAs. """ def black_white(mirna_names_1,mirna_names_2,group,manager): add_names = [x for x in mirna_names_1 if x not in mirna_names_2] add_names.sort() add_names = list(add_names for add_names,_ in itertools.groupby(add_names)) group.sort() group = list(group for group,_ in itertools.groupby(group)) zeros=["0"]*(len(group[0])-2) [add_names[i].extend(zeros) for i,_ in enumerate(add_names)] group=group+add_names manager.extend(group) ########################################################################################################> """ This function collapses the miRNAs with same sequences and different names into one entry by merging all the different names into one and are separated with the character "/" """ def merging_dupes(group,f_dupes): dupes=[] temp_mat =[] for num,_ in enumerate(group): if group[num][1] not in temp_mat and group[num][0] not in temp_mat: temp_mat.append(group[num][1]) temp_mat.append(group[num][0]) else: dupes.append(group[num][1]) dupes=list(set(dupes)) dupes=[[x] for x in dupes] for x in group: for y in dupes: if x[1]==y[0]: fl=0 if len(y)==1: y.append(x[0]) else: for i in range(1,len(y)): if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: fl=1 if len(x[0])<len(y[i]): del y[i] y.append(x[0]) break if fl==0: y.append((x[0])) for y in dupes: if len(y)>2: for i in range(len(y)-1,1,-1): y[1]=y[1]+"/"+y[i] del y[i] f_dupes.extend(dupes) ########################################################################################################> """ This function removes the duplications of sequences based on output from the fuction merging_dupes """ def apply_merging_dupes(group,dupes,managger): for x in group: for y in dupes: if x[1]==y[0]: x[0]=y[1] group.sort() group=list(group for group,_ in itertools.groupby(group)) managger.extend(group) ########################################################################################################> """ This function is optional and performs a filter for low counts miRNAs based on number of counts and the percentage of the samples, according to user preferences """ def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts): t_group_new=[] c_group_new=[] percent=int(per)/100 c_col_filter=round(percent*(len(c_group[1])-2)) t_col_filter=round(percent*(len(t_group[1])-2)) for i, _ in enumerate(c_group): c_cols=0 t_cols=0 c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)]) t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)]) if c_cols>=c_col_filter or t_cols>=t_col_filter: t_group_new.append(t_group[i]) c_group_new.append(c_group[i]) fil_c_group.extend(c_group_new) fil_t_group.extend(t_group_new) ################################################################################################################################################################################################################## """ This function exports the count matrices for every group (controls, treated) and condition (ref and templated miRNAs, non-templated miRNAs) """ def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per): if flag == 1 and int(per)!=-1: fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in fil_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in fil_con: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 2 and int(per)!=-1: fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in fil_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in fil_con: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 1: fp = open('Counts/Raw '+n2+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in raw_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Raw '+n1+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in raw_con: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 2: fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in tre_file_order: fp.write("\t"+y) for x in raw_tre: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in con_file_order: fp.write("\t"+y) for x in raw_con: fp.write("\n%s" % "\t".join(x)) fp.close() #################################################################################################################################################################################################################### """ This function exports the files of the database with all the info about every type of the detected miRNAs for every sample """ def DB_write(con,name,unique_seq,sorted_uni_arms,f): if f==1: if con=="c": fp = open('split1/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) if con=="t": fp = open('split2/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) for i in range(len(sorted_uni_arms)): temp = [] for j in range(len(unique_seq)): if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): temp.append(unique_seq[j]) temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) fp.write("*********************************************************************************************************\n") fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) fp.write("*********************************************************************************************************\n\n") [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] fp.write("\n" + "\n") fp.close() if f==2: if con=="c": fp = open('split3/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) if con=="t": fp = open('split4/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) for i in range(len(sorted_uni_arms)): temp = [] for j in range(len(unique_seq)): if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]: temp.append(unique_seq[j]) if temp!=[]: temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) fp.write("*********************************************************************************************************\n") fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) fp.write("*********************************************************************************************************\n\n") [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] fp.write("\n" + "\n") fp.close() ######################################################################################################################### """ This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates """ def merging_names(ini_mat,new): dupes=[] temp_mat =[] for num in range(len(ini_mat)): if ini_mat[num][1] not in temp_mat and ini_mat[num][0] not in temp_mat: temp_mat.append(ini_mat[num][1]) temp_mat.append(ini_mat[num][0]) else: dupes.append(ini_mat[num][1]) dupes=list(set(dupes)) for i in range(len(dupes)): dupes[i]=[dupes[i]] for x in ini_mat: for y in dupes: if x[1]==y[0]: fl=0 if len(y)==1: y.append(x[0]) else: for i in range(1,len(y)): if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: fl=1 if len(x[0])<len(y[i]): del y[i] y.append(x[0]) break if fl==0: y.append((x[0])) for y in dupes: if len(y)>2: for i in range(len(y)-1,1,-1): y[1]=y[1]+"/"+y[i] del y[i] for x in ini_mat: for y in dupes: if x[1]==y[0]: x[0]=y[1] ini_mat.sort() ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat)) new.extend(ini_mat) #################################################################################################################################################################################################################### """ This function exports the count matrices for differential expresion if user chose analysis with non-templated miRNAs detection """ def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro): for i in range(2,len(tem_samp[0])): fp = open(folder+tem_names[i-2]+'.txt','w') fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") for x in tem_samp: fp.write("%s" % "\t".join([x[0],x[i]])+"\n") for j in range(len(non_names)): if non_names[j]==tem_names[i-2]: for x in non_samp: fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") fp.close() ################################################################################################################################################################################################################# """ This function exports the count matrices for differential expresion if user chose analysis only with templated miRNAs detection """ def temp_counts_to_diff(names,samp,folder,pro): for i in range(2,len(samp[0])): fp = open(folder+names[i-2]+'.txt','w') fp.write("miRNA id"+"\t"+names[i-2]+"\n") for x in samp: fp.write("%s" % "\t".join([x[0],x[i]])+"\n") fp.close() ################################################################################################################################################################################################################# """ This function downloads the fasta files from MirGene site with ref miRNAs and star miRNAs sequences and merges them into one list """ def download_matures(matures,org_name): mature_mir=[] mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' data = urllib.request.urlopen(mat_url).read() file_mirna = data.decode('utf-8') mature_mir = file_mirna.split("\n") mature_mir = [x.replace(">","") for x in mature_mir] del mature_mir[-1] data = urllib.request.urlopen(star_url).read() file_mirna = data.decode('utf-8') star_mir = file_mirna.split("\n") star_mir = [x.replace(">","") for x in star_mir] del star_mir[-1] mature_mir.extend(star_mir) for i in range(1,len(mature_mir),2): mature_mir[i]=mature_mir[i].replace("U","T") matures.extend(mature_mir) ################################################################################################################### """ This function detects the templated isoforms from the 1st part of analysis These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs """ def non_template_ref(sc,st,all_isoforms): pre_uni_seq_con = list(sc) pre_uni_seq_tre = list(st) for x in pre_uni_seq_con: for y in x: if y[2] not in all_isoforms and "_t_" in y[2]: all_isoforms.append(y[2]) all_isoforms.append(y[9]) for x in pre_uni_seq_tre: for y in x: if y[2] not in all_isoforms and "_t_" in y[2]: all_isoforms.append(y[2]) all_isoforms.append(y[9]) ################################################################################################################################################################################################ """ This function adds uncommon detected mirnas among the samples with zeros as counts """ def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order): for y in mir_names: flag=0 for x in sample: if y[0]==x[0]: # check if miRNA exists in the sample flag=1 break if flag==0: sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence # sorting and remove duplicates sample.sort(key=lambda x: x[0]) sample=list(sample for sample,_ in itertools.groupby(sample)) # Return the updated sample l.acquire() new_d.append(sample) sample_order.append(sample_name) l.release() ###############################################################################################################################################################################################