Mercurial > repos > glogobyte > armdb
diff armdb_mirbase.py @ 11:6c3c84d4a5ea draft
Uploaded
author | glogobyte |
---|---|
date | Thu, 02 Dec 2021 14:07:22 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/armdb_mirbase.py Thu Dec 02 14:07:22 2021 +0000 @@ -0,0 +1,137 @@ +import subprocess +import argparse +import time +import urllib.request +from multiprocessing import Process, Queue +import itertools + +#---------------------------------Arguments------------------------------------------ + +subprocess.call(['mkdir', 'out']) +parser = argparse.ArgumentParser() + +parser.add_argument("-pos", "--positions", help="number of additional nucleotides", action="store") +parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") +parser.add_argument("-gen", "--genome", help="genome version", action="store") +parser.add_argument("-gff3", "--gff", help="gff file",action="store") +args = parser.parse_args() + +#-----------------------Download and read of the gff3 file--------------------------------- + +def read_url(q): + + url = 'https://www.mirbase.org/ftp/CURRENT/genomes/'+args.gff + data = urllib.request.urlopen(url).read() + file_mirna = data.decode('utf-8') + file_mirna = file_mirna.split("\n") + q.put(file_mirna) + + +#-----------------------Export of the original gff3 file--------------------------------- + +def write_gff(file_mirna): + f = open('original_mirnas.bed', "w") + + for i in range(len(file_mirna)): + f.write(file_mirna[i] + "\n") + +#------------------------Process and export of the file with mature mirnas------------------------------- + +def new_gff(file_mirna): + + mirna = [] # new list with shifted mirnas + positions =int(args.positions) # positions shifted + print(str(positions)+" positions shifted") + names=[] + + for i in range(len(file_mirna)): + + # Remove lines which conatain the word "primary" + if "primary" not in file_mirna[i]: + mirna.append(file_mirna[i]) + + # Check if the line starts with "chr" + if "chr" in file_mirna[i]: + a=file_mirna[i].split("\t")[0] + b=file_mirna[i].split("\t")[6] + c=file_mirna[i].split("=")[3].split(";")[0] + names.append([a,b,c]) + + names.sort() + sublists=[] + + [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2] + sublists.sort() + sublists=list(sublists for sublists, _ in itertools.groupby(sublists)) + unique_names=[[x[0][0],x[0][2]] for x in sublists] + + for x in unique_names: + flag = 0 + for i in range(len(mirna)): + + if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]: + flag+=1 + ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3] + mirna[i]=ktr + + + f = open('shifted_mirnas.bed', "w") + + for i in range(len(mirna)): + + if "chr" in mirna[i]: + + # change the name of current mirna + mirna_name_1 = mirna[i].split("=")[3] + mirna_name_2 = mirna[i].split("=")[4] + mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0] + mirna[i] = mirna[i].replace("miRNA", mirna_name_1) + + # Shift the position of mirna + start = mirna[i].split("\t")[3] + end = mirna[i].split("\t")[4] + shift_start = int(start)-positions # shift the interval to the left + shift_end = int(end)+positions # shift the interval to the right + + # Replace the previous intervals with the new + mirna[i] = mirna[i].replace(start, str(shift_start)) + mirna[i] = mirna[i].replace(end, str(shift_end)) + + f.write(mirna[i] + "\n") + + f.close() + + +#------------------------Extract the sequences of the Custom Arms with getfasta tool------------------------------- + +def bedtool(genome): + subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"]) +#=================================================================================================================================== + + +if __name__=='__main__': + + starttime = time.time() + q = Queue() + + # Read the original gff3 file + p1 = Process(target=read_url(q)) + p1.start() + p1.join() + file_mirna=q.get() + + # Export the original gff3 file + p2 = [Process(target=write_gff(file_mirna))] + + # Create the new gff3 file + p2.extend([Process(target=new_gff(file_mirna))]) + [x.start() for x in p2] + [x.join() for x in p2] + + # Extract the sequences of the Custom Arms + p3 = Process(target=bedtool(args.genome)) + p3.start() + p3.join() + + print('Runtime: {} seconds'.format(round(time.time() - starttime,2))) +