comparison armdb_mirbase.py @ 0:6c267b9256fa draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:22:58 +0000
parents
children 40c50594116c
comparison
equal deleted inserted replaced
-1:000000000000 0:6c267b9256fa
1 import subprocess
2 import argparse
3 import time
4 import urllib.request
5 from multiprocessing import Process, Queue
6 import itertools
7
8
9
10 #---------------------------------Arguments------------------------------------------
11
12 subprocess.call(['mkdir', 'out'])
13 parser = argparse.ArgumentParser()
14
15 parser.add_argument("-pos", "--positions", help="", action="store")
16 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
17 parser.add_argument("-gen", "--genome", help="tool directory path", action="store")
18 parser.add_argument("-gff3", "--gff", help="",action="store")
19 args = parser.parse_args()
20
21 #-----------------------Download and read of the gff3 file---------------------------------
22
23 def read_url(q):
24
25 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/genomes/'+args.gff
26 data = urllib.request.urlopen(url).read()
27 file_mirna = data.decode('utf-8')
28 file_mirna = file_mirna.split("\n")
29 q.put(file_mirna)
30
31
32 #-----------------------Export of the original gff3 file---------------------------------
33
34 def write_gff(file_mirna):
35 f = open('original_mirnas.bed', "w")
36
37 for i in range(len(file_mirna)):
38 f.write(file_mirna[i] + "\n")
39
40 #------------------------Process and export of the file with mature mirnas-------------------------------
41
42 def new_gff(file_mirna):
43
44 mirna = [] # new list with shifted mirnas
45 positions =int(args.positions) # positions shifted
46 print(str(positions)+" positions shifted")
47 names=[]
48
49 for i in range(len(file_mirna)):
50
51 # Remove lines which conatain the word "primary"
52 if "primary" not in file_mirna[i]:
53 mirna.append(file_mirna[i])
54
55 # Check if the line starts with "chr"
56 if "chr" in file_mirna[i]:
57 a=file_mirna[i].split("\t")[0]
58 b=file_mirna[i].split("\t")[6]
59 c=file_mirna[i].split("=")[3].split(";")[0]
60 names.append([a,b,c])
61
62 names.sort()
63 sublists=[]
64
65 [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2]
66 sublists.sort()
67 sublists=list(sublists for sublists, _ in itertools.groupby(sublists))
68 unique_names=[[x[0][0],x[0][2]] for x in sublists]
69
70 for x in unique_names:
71 flag = 0
72 for i in range(len(mirna)):
73
74 if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]:
75 flag+=1
76 ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3]
77 mirna[i]=ktr
78
79
80 f = open('shifted_mirnas.bed', "w")
81
82 for i in range(len(mirna)):
83
84 if "chr" in mirna[i]:
85
86 # change the name of current mirna
87 mirna_name_1 = mirna[i].split("=")[3]
88 mirna_name_2 = mirna[i].split("=")[4]
89 mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0]
90 mirna[i] = mirna[i].replace("miRNA", mirna_name_1)
91
92 # Shift the position of mirna
93 start = mirna[i].split("\t")[3]
94 end = mirna[i].split("\t")[4]
95 shift_start = int(start)-positions # shift the interval to the left
96 shift_end = int(end)+positions # shift the interval to the right
97
98 # Replace the previous intervals with the new
99 mirna[i] = mirna[i].replace(start, str(shift_start))
100 mirna[i] = mirna[i].replace(end, str(shift_end))
101
102 f.write(mirna[i] + "\n")
103
104 f.close()
105
106
107 #------------------------Extract the sequences of the Custom Arms with getfasta tool-------------------------------
108
109 def bedtool(genome):
110 subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"])
111 #===================================================================================================================================
112
113
114 if __name__=='__main__':
115
116 starttime = time.time()
117 q = Queue()
118
119 # Read the original gff3 file
120 p1 = Process(target=read_url(q))
121 p1.start()
122 p1.join()
123 file_mirna=q.get()
124
125 # Export the original gff3 file
126 p2 = [Process(target=write_gff(file_mirna))]
127
128 # Create the new gff3 file
129 p2.extend([Process(target=new_gff(file_mirna))])
130 [x.start() for x in p2]
131 [x.join() for x in p2]
132
133 # Extract the sequences of the Custom Arms
134 p3 = Process(target=bedtool(args.genome))
135 p3.start()
136 p3.join()
137
138 print('The total runtime was {} seconds'.format(time.time() - starttime))
139