Mercurial > repos > glogobyte > armdb
comparison armdb_mirbase.py @ 0:6c267b9256fa draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 13 Oct 2021 16:22:58 +0000 |
parents | |
children | 40c50594116c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6c267b9256fa |
---|---|
1 import subprocess | |
2 import argparse | |
3 import time | |
4 import urllib.request | |
5 from multiprocessing import Process, Queue | |
6 import itertools | |
7 | |
8 | |
9 | |
10 #---------------------------------Arguments------------------------------------------ | |
11 | |
12 subprocess.call(['mkdir', 'out']) | |
13 parser = argparse.ArgumentParser() | |
14 | |
15 parser.add_argument("-pos", "--positions", help="", action="store") | |
16 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") | |
17 parser.add_argument("-gen", "--genome", help="tool directory path", action="store") | |
18 parser.add_argument("-gff3", "--gff", help="",action="store") | |
19 args = parser.parse_args() | |
20 | |
21 #-----------------------Download and read of the gff3 file--------------------------------- | |
22 | |
23 def read_url(q): | |
24 | |
25 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/genomes/'+args.gff | |
26 data = urllib.request.urlopen(url).read() | |
27 file_mirna = data.decode('utf-8') | |
28 file_mirna = file_mirna.split("\n") | |
29 q.put(file_mirna) | |
30 | |
31 | |
32 #-----------------------Export of the original gff3 file--------------------------------- | |
33 | |
34 def write_gff(file_mirna): | |
35 f = open('original_mirnas.bed', "w") | |
36 | |
37 for i in range(len(file_mirna)): | |
38 f.write(file_mirna[i] + "\n") | |
39 | |
40 #------------------------Process and export of the file with mature mirnas------------------------------- | |
41 | |
42 def new_gff(file_mirna): | |
43 | |
44 mirna = [] # new list with shifted mirnas | |
45 positions =int(args.positions) # positions shifted | |
46 print(str(positions)+" positions shifted") | |
47 names=[] | |
48 | |
49 for i in range(len(file_mirna)): | |
50 | |
51 # Remove lines which conatain the word "primary" | |
52 if "primary" not in file_mirna[i]: | |
53 mirna.append(file_mirna[i]) | |
54 | |
55 # Check if the line starts with "chr" | |
56 if "chr" in file_mirna[i]: | |
57 a=file_mirna[i].split("\t")[0] | |
58 b=file_mirna[i].split("\t")[6] | |
59 c=file_mirna[i].split("=")[3].split(";")[0] | |
60 names.append([a,b,c]) | |
61 | |
62 names.sort() | |
63 sublists=[] | |
64 | |
65 [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2] | |
66 sublists.sort() | |
67 sublists=list(sublists for sublists, _ in itertools.groupby(sublists)) | |
68 unique_names=[[x[0][0],x[0][2]] for x in sublists] | |
69 | |
70 for x in unique_names: | |
71 flag = 0 | |
72 for i in range(len(mirna)): | |
73 | |
74 if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]: | |
75 flag+=1 | |
76 ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3] | |
77 mirna[i]=ktr | |
78 | |
79 | |
80 f = open('shifted_mirnas.bed', "w") | |
81 | |
82 for i in range(len(mirna)): | |
83 | |
84 if "chr" in mirna[i]: | |
85 | |
86 # change the name of current mirna | |
87 mirna_name_1 = mirna[i].split("=")[3] | |
88 mirna_name_2 = mirna[i].split("=")[4] | |
89 mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0] | |
90 mirna[i] = mirna[i].replace("miRNA", mirna_name_1) | |
91 | |
92 # Shift the position of mirna | |
93 start = mirna[i].split("\t")[3] | |
94 end = mirna[i].split("\t")[4] | |
95 shift_start = int(start)-positions # shift the interval to the left | |
96 shift_end = int(end)+positions # shift the interval to the right | |
97 | |
98 # Replace the previous intervals with the new | |
99 mirna[i] = mirna[i].replace(start, str(shift_start)) | |
100 mirna[i] = mirna[i].replace(end, str(shift_end)) | |
101 | |
102 f.write(mirna[i] + "\n") | |
103 | |
104 f.close() | |
105 | |
106 | |
107 #------------------------Extract the sequences of the Custom Arms with getfasta tool------------------------------- | |
108 | |
109 def bedtool(genome): | |
110 subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"]) | |
111 #=================================================================================================================================== | |
112 | |
113 | |
114 if __name__=='__main__': | |
115 | |
116 starttime = time.time() | |
117 q = Queue() | |
118 | |
119 # Read the original gff3 file | |
120 p1 = Process(target=read_url(q)) | |
121 p1.start() | |
122 p1.join() | |
123 file_mirna=q.get() | |
124 | |
125 # Export the original gff3 file | |
126 p2 = [Process(target=write_gff(file_mirna))] | |
127 | |
128 # Create the new gff3 file | |
129 p2.extend([Process(target=new_gff(file_mirna))]) | |
130 [x.start() for x in p2] | |
131 [x.join() for x in p2] | |
132 | |
133 # Extract the sequences of the Custom Arms | |
134 p3 = Process(target=bedtool(args.genome)) | |
135 p3.start() | |
136 p3.join() | |
137 | |
138 print('The total runtime was {} seconds'.format(time.time() - starttime)) | |
139 |