annotate armdb_mirbase.py @ 15:4c575b6d705c draft

Uploaded
author glogobyte
date Mon, 16 Oct 2023 14:17:27 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
1 import subprocess
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
2 import argparse
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
3 import time
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
4 import urllib.request
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
5 from multiprocessing import Process, Queue
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
6 import itertools
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
7
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
8 #---------------------------------Arguments------------------------------------------
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
9
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
10 subprocess.call(['mkdir', 'out'])
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
11 parser = argparse.ArgumentParser()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
12
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
13 parser.add_argument("-pos", "--positions", help="number of additional nucleotides", action="store")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
14 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
15 parser.add_argument("-gen", "--genome", help="genome version", action="store")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
16 parser.add_argument("-gff3", "--gff", help="gff file",action="store")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
17 args = parser.parse_args()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
18
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
19 #-----------------------Download and read of the gff3 file---------------------------------
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
20
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
21 def read_url(q):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
22
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
23 url = 'https://mirbase.org/download/CURRENT/genomes/'+args.gff
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
24 data = urllib.request.urlopen(url).read()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
25 file_mirna = data.decode('utf-8')
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
26 file_mirna = file_mirna.split("\n")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
27 q.put(file_mirna)
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
28
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
29
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
30 #-----------------------Export of the original gff3 file---------------------------------
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
31
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
32 def write_gff(file_mirna):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
33 f = open('original_mirnas.bed', "w")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
34
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
35 for i in range(len(file_mirna)):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
36 f.write(file_mirna[i] + "\n")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
37
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
38 #------------------------Process and export of the file with mature mirnas-------------------------------
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
39
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
40 def new_gff(file_mirna):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
41
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
42 mirna = [] # new list with shifted mirnas
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
43 positions =int(args.positions) # positions shifted
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
44 print(str(positions)+" positions shifted")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
45 names=[]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
46
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
47 for i in range(len(file_mirna)):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
48
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
49 # Remove lines which conatain the word "primary"
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
50 if "primary" not in file_mirna[i]:
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
51 mirna.append(file_mirna[i])
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
52
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
53 # Check if the line starts with "chr"
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
54 if "chr" in file_mirna[i]:
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
55 a=file_mirna[i].split("\t")[0]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
56 b=file_mirna[i].split("\t")[6]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
57 c=file_mirna[i].split("=")[3].split(";")[0]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
58 names.append([a,b,c])
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
59
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
60 names.sort()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
61 sublists=[]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
62
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
63 [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
64 sublists.sort()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
65 sublists=list(sublists for sublists, _ in itertools.groupby(sublists))
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
66 unique_names=[[x[0][0],x[0][2]] for x in sublists]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
67
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
68 for x in unique_names:
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
69 flag = 0
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
70 for i in range(len(mirna)):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
71
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
72 if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]:
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
73 flag+=1
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
74 ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
75 mirna[i]=ktr
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
76
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
77
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
78 f = open('shifted_mirnas.bed', "w")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
79
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
80 for i in range(len(mirna)):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
81
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
82 if "chr" in mirna[i]:
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
83
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
84 # change the name of current mirna
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
85 mirna_name_1 = mirna[i].split("=")[3]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
86 mirna_name_2 = mirna[i].split("=")[4]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
87 mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
88 mirna[i] = mirna[i].replace("miRNA", mirna_name_1)
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
89
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
90 # Shift the position of mirna
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
91 start = mirna[i].split("\t")[3]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
92 end = mirna[i].split("\t")[4]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
93 shift_start = int(start)-positions # shift the interval to the left
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
94 shift_end = int(end)+positions # shift the interval to the right
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
95
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
96 # Replace the previous intervals with the new
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
97 mirna[i] = mirna[i].replace(start, str(shift_start))
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
98 mirna[i] = mirna[i].replace(end, str(shift_end))
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
99
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
100 f.write(mirna[i] + "\n")
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
101
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
102 f.close()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
103
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
104
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
105 #------------------------Extract the sequences of the Custom Arms with getfasta tool-------------------------------
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
106
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
107 def bedtool(genome):
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
108 subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"])
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
109 #===================================================================================================================================
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
110
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
111
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
112 if __name__=='__main__':
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
113
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
114 starttime = time.time()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
115 q = Queue()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
116
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
117 # Read the original gff3 file
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
118 p1 = Process(target=read_url(q))
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
119 p1.start()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
120 p1.join()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
121 file_mirna=q.get()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
122
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
123 # Export the original gff3 file
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
124 p2 = [Process(target=write_gff(file_mirna))]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
125
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
126 # Create the new gff3 file
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
127 p2.extend([Process(target=new_gff(file_mirna))])
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
128 [x.start() for x in p2]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
129 [x.join() for x in p2]
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
130
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
131 # Extract the sequences of the Custom Arms
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
132 p3 = Process(target=bedtool(args.genome))
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
133 p3.start()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
134 p3.join()
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
135
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
136 print('Runtime: {} seconds'.format(round(time.time() - starttime,2)))
4c575b6d705c Uploaded
glogobyte
parents:
diff changeset
137