11
|
1 import subprocess
|
|
2 import argparse
|
|
3 import time
|
|
4 import urllib.request
|
|
5 from multiprocessing import Process, Queue
|
|
6 import itertools
|
|
7
|
|
8 #---------------------------------Arguments------------------------------------------
|
|
9
|
|
10 subprocess.call(['mkdir', 'out'])
|
|
11 parser = argparse.ArgumentParser()
|
|
12
|
|
13 parser.add_argument("-pos", "--positions", help="number of additional nucleotides", action="store")
|
|
14 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
|
|
15 parser.add_argument("-gen", "--genome", help="genome version", action="store")
|
|
16 parser.add_argument("-gff3", "--gff", help="gff file",action="store")
|
|
17 args = parser.parse_args()
|
|
18
|
|
19 #-----------------------Download and read of the gff3 file---------------------------------
|
|
20
|
|
21 def read_url(q):
|
|
22
|
|
23 url = 'https://www.mirbase.org/ftp/CURRENT/genomes/'+args.gff
|
|
24 data = urllib.request.urlopen(url).read()
|
|
25 file_mirna = data.decode('utf-8')
|
|
26 file_mirna = file_mirna.split("\n")
|
|
27 q.put(file_mirna)
|
|
28
|
|
29
|
|
30 #-----------------------Export of the original gff3 file---------------------------------
|
|
31
|
|
32 def write_gff(file_mirna):
|
|
33 f = open('original_mirnas.bed', "w")
|
|
34
|
|
35 for i in range(len(file_mirna)):
|
|
36 f.write(file_mirna[i] + "\n")
|
|
37
|
|
38 #------------------------Process and export of the file with mature mirnas-------------------------------
|
|
39
|
|
40 def new_gff(file_mirna):
|
|
41
|
|
42 mirna = [] # new list with shifted mirnas
|
|
43 positions =int(args.positions) # positions shifted
|
|
44 print(str(positions)+" positions shifted")
|
|
45 names=[]
|
|
46
|
|
47 for i in range(len(file_mirna)):
|
|
48
|
|
49 # Remove lines which conatain the word "primary"
|
|
50 if "primary" not in file_mirna[i]:
|
|
51 mirna.append(file_mirna[i])
|
|
52
|
|
53 # Check if the line starts with "chr"
|
|
54 if "chr" in file_mirna[i]:
|
|
55 a=file_mirna[i].split("\t")[0]
|
|
56 b=file_mirna[i].split("\t")[6]
|
|
57 c=file_mirna[i].split("=")[3].split(";")[0]
|
|
58 names.append([a,b,c])
|
|
59
|
|
60 names.sort()
|
|
61 sublists=[]
|
|
62
|
|
63 [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2]
|
|
64 sublists.sort()
|
|
65 sublists=list(sublists for sublists, _ in itertools.groupby(sublists))
|
|
66 unique_names=[[x[0][0],x[0][2]] for x in sublists]
|
|
67
|
|
68 for x in unique_names:
|
|
69 flag = 0
|
|
70 for i in range(len(mirna)):
|
|
71
|
|
72 if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]:
|
|
73 flag+=1
|
|
74 ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3]
|
|
75 mirna[i]=ktr
|
|
76
|
|
77
|
|
78 f = open('shifted_mirnas.bed', "w")
|
|
79
|
|
80 for i in range(len(mirna)):
|
|
81
|
|
82 if "chr" in mirna[i]:
|
|
83
|
|
84 # change the name of current mirna
|
|
85 mirna_name_1 = mirna[i].split("=")[3]
|
|
86 mirna_name_2 = mirna[i].split("=")[4]
|
|
87 mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0]
|
|
88 mirna[i] = mirna[i].replace("miRNA", mirna_name_1)
|
|
89
|
|
90 # Shift the position of mirna
|
|
91 start = mirna[i].split("\t")[3]
|
|
92 end = mirna[i].split("\t")[4]
|
|
93 shift_start = int(start)-positions # shift the interval to the left
|
|
94 shift_end = int(end)+positions # shift the interval to the right
|
|
95
|
|
96 # Replace the previous intervals with the new
|
|
97 mirna[i] = mirna[i].replace(start, str(shift_start))
|
|
98 mirna[i] = mirna[i].replace(end, str(shift_end))
|
|
99
|
|
100 f.write(mirna[i] + "\n")
|
|
101
|
|
102 f.close()
|
|
103
|
|
104
|
|
105 #------------------------Extract the sequences of the Custom Arms with getfasta tool-------------------------------
|
|
106
|
|
107 def bedtool(genome):
|
|
108 subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"])
|
|
109 #===================================================================================================================================
|
|
110
|
|
111
|
|
112 if __name__=='__main__':
|
|
113
|
|
114 starttime = time.time()
|
|
115 q = Queue()
|
|
116
|
|
117 # Read the original gff3 file
|
|
118 p1 = Process(target=read_url(q))
|
|
119 p1.start()
|
|
120 p1.join()
|
|
121 file_mirna=q.get()
|
|
122
|
|
123 # Export the original gff3 file
|
|
124 p2 = [Process(target=write_gff(file_mirna))]
|
|
125
|
|
126 # Create the new gff3 file
|
|
127 p2.extend([Process(target=new_gff(file_mirna))])
|
|
128 [x.start() for x in p2]
|
|
129 [x.join() for x in p2]
|
|
130
|
|
131 # Extract the sequences of the Custom Arms
|
|
132 p3 = Process(target=bedtool(args.genome))
|
|
133 p3.start()
|
|
134 p3.join()
|
|
135
|
|
136 print('Runtime: {} seconds'.format(round(time.time() - starttime,2)))
|
|
137
|