Mercurial > repos > glogobyte > isoread
changeset 5:810e789ffeab draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 13 Oct 2021 16:04:54 +0000 |
parents | f44185f616bc |
children | d58f050acd18 |
files | mirgene_functions.py |
diffstat | 1 files changed, 717 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mirgene_functions.py Wed Oct 13 16:04:54 2021 +0000 @@ -0,0 +1,717 @@ +import itertools +import urllib.request +from collections import OrderedDict +import copy + +######################################################################################################################################################## + +""" Read a file and return it as a list """ + +def read(path, flag): + if flag == 0: + with open(path) as fp: + file=fp.readlines() + fp.close() + return file + + if flag == 1: + with open(path) as fp: + file = fp.read().splitlines() + fp.close() + return file + +# Write a list to a txt file +def write(path, list): + with open(path,'w') as fp: + for x in list: + fp.write(str("\t".join(x[1:-1]))) + fp.close() + +######################################################################################################################################################## + +""" Detect the longest common substring sequence between two mirnas """ + +def longestSubstring(str1, str2): + + from difflib import SequenceMatcher + # initialize SequenceMatcher object with + # input string + seqMatch = SequenceMatcher(None, str1, str2) + + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + + # print longest substring + if (match.size != 0): + return str1[match.a: match.a + match.size] + else: + print('No longest common sub-string found') + + +################################################################################################################################################################# + +""" + +Read the sam files from alignment tool and do the followings: + +1) Keep mapped reads +2) Keep all sequences with length between 18 and 26 nucleotides +3) Detects the ref and templated miRNAs +4) Gives names to templated miRNAs based on ref miRNAs + +""" + +def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts): + + # read the sam file + ini_sam=read(path,0) + main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] + unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] + filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam] + sorted_uni_arms = [] + + # Detection of differences between the canonical miRNA and the detected miRNA + for i in range(len(mature_mirnas)): + tmp_count_reads = 0 # calculate the total number of reads + tmp_count_seq = 0 # calculate the total number of sequences + for j in range(len(unique_seq)): + + if mature_mirnas[i] == unique_seq[j][2]: + + temp_mature = mature_mirnas[i+1] + off_part = longestSubstring(temp_mature, unique_seq[j][9]) + + mat_diff = temp_mature.split(off_part) + mat_diff = [len(mat_diff[0]), len(mat_diff[1])] + + unique_diff = unique_seq[j][9].split(off_part) + unique_diff = [len(unique_diff[0]), len(unique_diff[1])] + + # Handling of some special mirnas like (hsa-miR-8485) + if mat_diff[1]!=0 and unique_diff[1]!=0: + unique_seq[j]=1 + pre_pos = 0 + post_pos = 0 + + elif mat_diff[0]!=0 and unique_diff[0]!=0: + unique_seq[j]=1 + pre_pos = 0 + post_pos = 0 + + else: + # Keep the findings + pre_pos = mat_diff[0]-unique_diff[0] + post_pos = unique_diff[1]-mat_diff[1] + tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) + tmp_count_seq = tmp_count_seq+1 + + # Store the detected miRNAs with new names according to the findings + if pre_pos != 0 or post_pos != 0: + if pre_pos == 0: + unique_seq[j][2] = unique_seq[j][2] + "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) + elif post_pos == 0: + unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) + else: + unique_seq[j][2] = unique_seq[j][2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) + + # Remove the values "1" from the handling of special mirnas (hsa-miR-8485) + for x in range(unique_seq.count(1)): + unique_seq.remove(1) + + # metrics for the production of database + if tmp_count_reads != 0 and tmp_count_seq != 0: + sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) + + # Sorting of the metrics for database + sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) + + # Correction of metrics due to the collapsing and removing of duplicates for the production of Database + for y in sorted_uni_arms: + counts=0 + seqs=0 + for x in unique_seq: + if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: + counts+=int(x[0].split("-")[1]) + seqs+=1 + + y[1]=seqs + y[2]=counts + + # Output variables + temp_mirna_names=[] + + l.acquire() + if case == "c" or case == "t": + temp_mirna_names.extend(z[2] for z in unique_seq) + names_n_seqs.extend([[y[2],y[9]] for y in unique_seq]) + deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) + mirna_names.extend(temp_mirna_names) + unmap_seq.value += sum([1 for x in main_sam if x[1] == '4']) + unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4']) + file_order.append(file) + samples.append(unique_seq) + data.append([case,file,unique_seq,sorted_uni_arms]) + ini_sample.append(filter_sam) + l.release() + + +###################################################################################################################################### +""" + +Read a sam file from Bowtie and do the followings: + +1) Keep unmapped reads +2) Keep all sequences with length between 18 and 26 nucleotides +3) Detects the non-template isomirs +4) Gives names to isomir's based on ref miRNAs + +""" + +def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs): + + # read the sam file + ini_sam=read(path,0) + main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] + unique_seq=[] + unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] + uni_seq=[] + + # Calculate the shifted positions for every isomir and add them to the name of it + sorted_uni_arms = [] + for i in range(1,len(mature_mirnas),2): + tmp_count_reads = 0 # calculate the total number of reads + tmp_count_seq = 0 # calculate the total number of sequences + + for j in range(len(unique_seq)): + + temp_mature = mature_mirnas[i].strip().replace("U", "T") + + # Detection of differences between the canonical miRNA and the detected non template miRNA + if temp_mature in unique_seq[j][9]: + + off_part = longestSubstring(temp_mature, unique_seq[j][9]) + + mat_diff = temp_mature.split(off_part) + mat_diff = [len(mat_diff[0]), len(mat_diff[1])] + + unique_diff = unique_seq[j][9].split(off_part) + if len(unique_diff)<=2: + unique_diff = [len(unique_diff[0]), len(unique_diff[1])] + + pre_pos = mat_diff[0]-unique_diff[0] + post_pos = unique_diff[1]-mat_diff[1] + + lengthofmir = len(off_part) + post_pos + if pre_pos == 0 and post_pos<4: + tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) + tmp_count_seq = tmp_count_seq + 1 + + t_name=copy.deepcopy(unique_seq[j]) + t_name[2]=mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) + uni_seq.append(t_name) + + # metrics for the production of database + if tmp_count_reads != 0 and tmp_count_seq != 0: + sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) + + + sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) + unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) + + # Output variables + + l.acquire() + if case == "c" or case == "t": + names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"]) + n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) + file_order.append(file) + data.append([case,file,unique_seq,sorted_uni_arms]) + l.release() + +################################################################################################################################################################################################################# + +""" + +This function detects the differences between the two groups (control, treated). +Then copy the undetected miRNAs from one group to other and add zeros as counts. +With this way the two groups will have the same number of miRNAs. + +""" + +def black_white(mirna_names_1,mirna_names_2,group,manager): + + add_names = [x for x in mirna_names_1 if x not in mirna_names_2] + add_names.sort() + add_names = list(add_names for add_names,_ in itertools.groupby(add_names)) + + group.sort() + group = list(group for group,_ in itertools.groupby(group)) + + zeros=["0"]*(len(group[0])-2) + [add_names[i].extend(zeros) for i,_ in enumerate(add_names)] + group=group+add_names + + manager.extend(group) + +########################################################################################################> + +""" + +This function collapses the miRNAs with same sequences and different names into one entry +by merging all the different names into one and are separated with the character "/" + +""" + +def merging_dupes(group,f_dupes): + + dupes=[] + temp_mat =[] + + for num,_ in enumerate(group): + + if group[num][1] not in temp_mat and group[num][0] not in temp_mat: + temp_mat.append(group[num][1]) + temp_mat.append(group[num][0]) + else: + dupes.append(group[num][1]) + + + dupes=list(set(dupes)) + + dupes=[[x] for x in dupes] + + for x in group: + for y in dupes: + if x[1]==y[0]: + fl=0 + if len(y)==1: + y.append(x[0]) + else: + for i in range(1,len(y)): + if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: + fl=1 + if len(x[0])<len(y[i]): + del y[i] + y.append(x[0]) + break + + if fl==0: + y.append((x[0])) + + for y in dupes: + if len(y)>2: + for i in range(len(y)-1,1,-1): + y[1]=y[1]+"/"+y[i] + del y[i] + + f_dupes.extend(dupes) + + +########################################################################################################> + +""" + +This function removes the duplications of sequences based on output from the fuction merging_dupes + +""" + +def apply_merging_dupes(group,dupes,managger): + + for x in group: + for y in dupes: + if x[1]==y[0]: + x[0]=y[1] + + group.sort() + group=list(group for group,_ in itertools.groupby(group)) + managger.extend(group) + +########################################################################################################> + +""" + +This function is optional and performs a filter for low counts miRNAs based on +number of counts and the percentage of the samples, according to user preferences + +""" + +def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts): + + t_group_new=[] + c_group_new=[] + + percent=int(per)/100 + c_col_filter=round(percent*(len(c_group[1])-2)) + t_col_filter=round(percent*(len(t_group[1])-2)) + + for i, _ in enumerate(c_group): + c_cols=0 + t_cols=0 + + c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)]) + t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)]) + + if c_cols>=c_col_filter or t_cols>=t_col_filter: + t_group_new.append(t_group[i]) + c_group_new.append(c_group[i]) + + fil_c_group.extend(c_group_new) + fil_t_group.extend(t_group_new) + +################################################################################################################################################################################################################## + +""" + +This function exports the count matrices for every group (controls, treated) +and condition (ref and templated miRNAs, non-templated miRNAs) + +""" + +def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per): + + if flag == 1 and int(per)!=-1: + fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + for x in fil_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in fil_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 2 and int(per)!=-1: + fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + + for x in fil_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in fil_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 1: + fp = open('Counts/Raw '+n2+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + for x in raw_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Raw '+n1+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in raw_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + if flag == 2: + fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in tre_file_order: + fp.write("\t"+y) + + + for x in raw_tre: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in con_file_order: + fp.write("\t"+y) + + for x in raw_con: + fp.write("\n%s" % "\t".join(x)) + fp.close() + +#################################################################################################################################################################################################################### + +""" + +This function exports the files of the database with all the info +about every type of the detected miRNAs for every sample + +""" + +def DB_write(con,name,unique_seq,sorted_uni_arms,f): + + if f==1: + if con=="c": + fp = open('split1/'+name, 'w') + + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + if con=="t": + fp = open('split2/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + + for i in range(len(sorted_uni_arms)): + temp = [] + for j in range(len(unique_seq)): + + if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): + + temp.append(unique_seq[j]) + + temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) + fp.write("*********************************************************************************************************\n") + fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) + fp.write("*********************************************************************************************************\n\n") + [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] + fp.write("\n" + "\n") + fp.close() + + if f==2: + + if con=="c": + fp = open('split3/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + if con=="t": + fp = open('split4/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + + for i in range(len(sorted_uni_arms)): + temp = [] + for j in range(len(unique_seq)): + if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]: + temp.append(unique_seq[j]) + if temp!=[]: + temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) + fp.write("*********************************************************************************************************\n") + fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) + fp.write("*********************************************************************************************************\n\n") + [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] + fp.write("\n" + "\n") + fp.close() + + +######################################################################################################################### + +""" + +This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates + +""" + +def merging_names(ini_mat,new): + + dupes=[] + temp_mat =[] + + for num in range(len(ini_mat)): + + if ini_mat[num][1] not in temp_mat and ini_mat[num][0] not in temp_mat: + temp_mat.append(ini_mat[num][1]) + temp_mat.append(ini_mat[num][0]) + else: + dupes.append(ini_mat[num][1]) + + dupes=list(set(dupes)) + + for i in range(len(dupes)): + dupes[i]=[dupes[i]] + + for x in ini_mat: + for y in dupes: + if x[1]==y[0]: + fl=0 + if len(y)==1: + y.append(x[0]) + else: + for i in range(1,len(y)): + if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: + fl=1 + if len(x[0])<len(y[i]): + del y[i] + y.append(x[0]) + break + + if fl==0: + y.append((x[0])) + + for y in dupes: + if len(y)>2: + for i in range(len(y)-1,1,-1): + y[1]=y[1]+"/"+y[i] + del y[i] + + + for x in ini_mat: + for y in dupes: + if x[1]==y[0]: + x[0]=y[1] + + ini_mat.sort() + ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat)) + new.extend(ini_mat) + +#################################################################################################################################################################################################################### + +""" + +This function exports the count matrices for differential expresion +if user chose analysis with non-templated miRNAs detection + +""" + +def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro): + + for i in range(2,len(tem_samp[0])): + + fp = open(folder+tem_names[i-2]+'.txt','w') + fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") + + for x in tem_samp: + fp.write("%s" % "\t".join([x[0],x[i]])+"\n") + + for j in range(len(non_names)): + if non_names[j]==tem_names[i-2]: + for x in non_samp: + fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") + fp.close() + +################################################################################################################################################################################################################# + +""" + +This function exports the count matrices for differential expresion +if user chose analysis only with templated miRNAs detection + +""" + +def temp_counts_to_diff(names,samp,folder,pro): + + for i in range(2,len(samp[0])): + + fp = open(folder+names[i-2]+'.txt','w') + fp.write("miRNA id"+"\t"+names[i-2]+"\n") + + for x in samp: + fp.write("%s" % "\t".join([x[0],x[i]])+"\n") + fp.close() + +################################################################################################################################################################################################################# + +""" + +This function downloads the fasta files from MirGene site +with ref miRNAs and star miRNAs sequences and merges them +into one list + +""" + +def download_matures(matures,org_name): + + mature_mir=[] + + mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' + star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' + + data = urllib.request.urlopen(mat_url).read() + file_mirna = data.decode('utf-8') + mature_mir = file_mirna.split("\n") + mature_mir = [x.replace(">","") for x in mature_mir] + del mature_mir[-1] + + data = urllib.request.urlopen(star_url).read() + file_mirna = data.decode('utf-8') + star_mir = file_mirna.split("\n") + star_mir = [x.replace(">","") for x in star_mir] + del star_mir[-1] + + mature_mir.extend(star_mir) + + for i in range(1,len(mature_mir),2): + mature_mir[i]=mature_mir[i].replace("U","T") + + matures.extend(mature_mir) + +################################################################################################################### + +""" + +This function detects the templated isoforms from the 1st part of analysis +These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs + +""" + +def non_template_ref(sc,st,all_isoforms): + + pre_uni_seq_con = list(sc) + pre_uni_seq_tre = list(st) + + for x in pre_uni_seq_con: + for y in x: + if y[2] not in all_isoforms and "_t_" in y[2]: + all_isoforms.append(y[2]) + all_isoforms.append(y[9]) + + for x in pre_uni_seq_tre: + for y in x: + if y[2] not in all_isoforms and "_t_" in y[2]: + all_isoforms.append(y[2]) + all_isoforms.append(y[9]) + +################################################################################################################################################################################################ + +""" + +This function adds uncommon detected mirnas among the samples with zeros as counts + +""" + +def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order): + + for y in mir_names: + flag=0 + for x in sample: + if y[0]==x[0]: # check if miRNA exists in the sample + flag=1 + break + if flag==0: + sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence + + # sorting and remove duplicates + sample.sort(key=lambda x: x[0]) + sample=list(sample for sample,_ in itertools.groupby(sample)) + + # Return the updated sample + l.acquire() + new_d.append(sample) + sample_order.append(sample_name) + l.release() + +###############################################################################################################################################################################################