Mercurial > repos > glogobyte > armdb
view armdb_mirbase.py @ 8:6f0fa933b3ed draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 20 Oct 2021 15:16:49 +0000 |
parents | 40c50594116c |
children |
line wrap: on
line source
import subprocess import argparse import time import urllib.request from multiprocessing import Process, Queue import itertools #---------------------------------Arguments------------------------------------------ subprocess.call(['mkdir', 'out']) parser = argparse.ArgumentParser() parser.add_argument("-pos", "--positions", help="number of additional nucleotides", action="store") parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") parser.add_argument("-gen", "--genome", help="genome version", action="store") parser.add_argument("-gff3", "--gff", help="gff file",action="store") args = parser.parse_args() #-----------------------Download and read of the gff3 file--------------------------------- def read_url(q): url = 'ftp://mirbase.org/pub/mirbase/CURRENT/genomes/'+args.gff data = urllib.request.urlopen(url).read() file_mirna = data.decode('utf-8') file_mirna = file_mirna.split("\n") q.put(file_mirna) #-----------------------Export of the original gff3 file--------------------------------- def write_gff(file_mirna): f = open('original_mirnas.bed', "w") for i in range(len(file_mirna)): f.write(file_mirna[i] + "\n") #------------------------Process and export of the file with mature mirnas------------------------------- def new_gff(file_mirna): mirna = [] # new list with shifted mirnas positions =int(args.positions) # positions shifted print(str(positions)+" positions shifted") names=[] for i in range(len(file_mirna)): # Remove lines which conatain the word "primary" if "primary" not in file_mirna[i]: mirna.append(file_mirna[i]) # Check if the line starts with "chr" if "chr" in file_mirna[i]: a=file_mirna[i].split("\t")[0] b=file_mirna[i].split("\t")[6] c=file_mirna[i].split("=")[3].split(";")[0] names.append([a,b,c]) names.sort() sublists=[] [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2] sublists.sort() sublists=list(sublists for sublists, _ in itertools.groupby(sublists)) unique_names=[[x[0][0],x[0][2]] for x in sublists] for x in unique_names: flag = 0 for i in range(len(mirna)): if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]: flag+=1 ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3] mirna[i]=ktr f = open('shifted_mirnas.bed', "w") for i in range(len(mirna)): if "chr" in mirna[i]: # change the name of current mirna mirna_name_1 = mirna[i].split("=")[3] mirna_name_2 = mirna[i].split("=")[4] mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0] mirna[i] = mirna[i].replace("miRNA", mirna_name_1) # Shift the position of mirna start = mirna[i].split("\t")[3] end = mirna[i].split("\t")[4] shift_start = int(start)-positions # shift the interval to the left shift_end = int(end)+positions # shift the interval to the right # Replace the previous intervals with the new mirna[i] = mirna[i].replace(start, str(shift_start)) mirna[i] = mirna[i].replace(end, str(shift_end)) f.write(mirna[i] + "\n") f.close() #------------------------Extract the sequences of the Custom Arms with getfasta tool------------------------------- def bedtool(genome): subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"]) #=================================================================================================================================== if __name__=='__main__': starttime = time.time() q = Queue() # Read the original gff3 file p1 = Process(target=read_url(q)) p1.start() p1.join() file_mirna=q.get() # Export the original gff3 file p2 = [Process(target=write_gff(file_mirna))] # Create the new gff3 file p2.extend([Process(target=new_gff(file_mirna))]) [x.start() for x in p2] [x.join() for x in p2] # Extract the sequences of the Custom Arms p3 = Process(target=bedtool(args.genome)) p3.start() p3.join() print('Runtime: {} seconds'.format(round(time.time() - starttime,2)))