comparison scaffremodler/group4contig.py @ 0:66885fa414c8 draft default tip

Uploaded
author gdroc
date Mon, 14 Nov 2016 08:31:23 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:66885fa414c8
1
2 #
3 # Copyright 2014 CIRAD
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, see <http://www.gnu.org/licenses/> or
17 # write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston,
19 # MA 02110-1301, USA.
20 #
21 #
22
23 import optparse, os, shutil, subprocess, sys, tempfile, fileinput, ConfigParser, operator, time
24
25
26
27 def __main__():
28 #Parse Command Line
29 parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : guillaume.martin@cirad.fr\n\n"
30 "This script takes scaffold to join and group them by linkage. The input is a tabulated file looking as folowed:\n"
31 "scaffold1 1 2458028 FWD scaffold2 1 1 FWD FWD contig\n"
32 "scaffold36 1 250001 FWD scaffold5 2000000 2000000 FWD REV contig")
33 # Wrapper options.
34 parser.add_option( '', '--table', dest='table', help='The table file input')
35 parser.add_option( '', '--out', dest='out', default='intermediate_junction.txt', help='The output file name, [default: %default]')
36 (options, args) = parser.parse_args()
37
38 # Filling dictionnary
39 dico_line = set()
40 file = open(options.table)
41 for line in file:
42 data = line.split()
43 if data:
44 if data[3] != 'FWD' or data[7] != 'FWD':
45 sys.exit('Warning! the orientation filled in column 4 or 8 is not managed, this column should contain "FWD" ')
46 dico_line.add(line)
47 file.close()
48
49 dico = set() # --> to identify already treated lines
50 dico_scaff = {} # --> a hash table containing a list of three elements : (1) grouped scaffold ordered, (2) scaffold orientation in the group, (3) ungrouped scaffolds, (4) scaffold number
51 i = 0
52 for line in dico_line:
53 data = line.split()
54 if data:
55 if not(line in dico): # --> this line start a new group
56 i += 1
57 # print i
58 dico.add(line) # --> record a line already treated
59 dico_scaff[i] = [[data[4]],[data[7]],set(), 1]
60 taille_dico = int(dico_scaff[i][3]) # --> to define a group is inflating or not (in the while loop)
61 if data[5] == '1':# --> scaffold should be inserted at the begining of the list
62 dico_scaff[i][0].insert(0,data[0])
63 dico_scaff[i][1].insert(0,data[8])
64 dico_scaff[i][3] += 1
65 else:
66 dico_scaff[i][0].append(data[0])
67 dico_scaff[i][1].append(data[8])
68 dico_scaff[i][3] += 1
69 # --> The new group as been stared
70 while taille_dico < dico_scaff[i][3]: # --> verification is new scaffolds have been added to group
71 taille_dico = int(dico_scaff[i][3])
72 # print dico_scaff[i][0], dico_scaff[i][1], len(dico_scaff[i][2]), dico_scaff[i][3]
73 for line2 in dico_line:
74 data2 = line2.split()
75 if data2 and not(line2 in dico):
76 if data2[4] in dico_scaff[i][0]: # --> checking if this scaffold is already in the group
77 dico.add(line2)
78 dico_scaff[i][3] += 1
79 if data2[4] == dico_scaff[i][0][-1]: # --> this scaffold is at the end of the list
80 if dico_scaff[i][1][-1] == 'FWD': # --> if this scaffold is FWD orientated
81 if data2[5] == '1': # --> the new scaffold could not be inserted
82 dico_scaff[i][2].add(line2)
83 # print 'toto1'
84 else: # --> the new scaffold could be inserted at the end
85 dico_scaff[i][0].append(data2[0])
86 dico_scaff[i][1].append(data2[8])
87 elif dico_scaff[i][1][-1] == 'REV': # --> if this scaffold is REV orientated
88 if data2[5] != '1': # --> the new scaffold could not be inserted
89 dico_scaff[i][2].add(line2)
90 # print 'toto2'
91 else: # --> the new scaffold could be inserted at the end but its orientation should be reverted
92 dico_scaff[i][0].append(data2[0])
93 if data2[8] == 'FWD':
94 dico_scaff[i][1].append('REV')
95 elif data2[8] == 'REV':
96 dico_scaff[i][1].append('FWD')
97 else:
98 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
99 else:
100 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
101 elif data2[4] == dico_scaff[i][0][0]: # --> this scaffold is at the begining of the list
102 if dico_scaff[i][1][0] == 'FWD': # --> if this scaffold is FWD orientated
103 if data2[5] == '1': # --> the new scaffold could be inserted at the begining
104 dico_scaff[i][0].insert(0,data2[0])
105 dico_scaff[i][1].insert(0,data2[8])
106 else:# --> the new scaffold could not be inserted
107 dico_scaff[i][2].add(line2)
108 # print 'toto3'
109 elif dico_scaff[i][1][0] == 'REV': # --> if this scaffold is REV orientated
110 if data2[5] == '1': # --> the new scaffold could not be inserted
111 dico_scaff[i][2].add(line2)
112 # print 'toto4'
113 else: # --> the new scaffold could be inserted at the begining but its orientation should be reverted
114 dico_scaff[i][0].insert(0,data2[0])
115 if data2[8] == 'FWD':
116 dico_scaff[i][1].insert(0,'REV')
117 elif data2[8] == 'REV':
118 dico_scaff[i][1].insert(0,'FWD')
119 else:
120 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
121 else:
122 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
123 else:
124 dico_scaff[i][2].add(line2)
125 # print 'toto5'
126 elif data2[0] in dico_scaff[i][0]: # --> checking if this scaffold is already in the group
127 dico.add(line2)
128 dico_scaff[i][3] += 1
129 if data2[0] == dico_scaff[i][0][-1]: # --> this scaffold is at the end of the list
130 if dico_scaff[i][1][-1] == 'FWD': # --> if this scaffold is FWD orientated
131 if data2[5] == '1':
132 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the end
133 dico_scaff[i][0].append(data2[4])
134 dico_scaff[i][1].append(data2[7])
135 elif data2[8] == 'REV': # --> the new scaffold could not be inserted
136 dico_scaff[i][2].add(line2)
137 # print 'toto6'
138 else:
139 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
140 else:
141 if data2[8] == 'REV': # --> the new scaffold could be inserted at the end
142 dico_scaff[i][0].append(data2[4])
143 dico_scaff[i][1].append('REV')
144 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted
145 dico_scaff[i][2].add(line2)
146 # print 'toto7'
147 else:
148 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
149 elif dico_scaff[i][1][-1] == 'REV': # --> if this scaffold is REV orientated
150 if data2[5] == '1':
151 if data2[8] == 'REV': # --> the new scaffold could be inserted at the end
152 dico_scaff[i][0].append(data2[4])
153 dico_scaff[i][1].append(data2[7])
154 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted
155 dico_scaff[i][2].add(line2)
156 # print 'toto8'
157 else:
158 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
159 else:
160 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the end
161 dico_scaff[i][0].append(data2[4])
162 dico_scaff[i][1].append('REV')
163 elif data2[8] == 'REV': # --> the new scaffold could not be inserted
164 dico_scaff[i][2].add(line2)
165 # print 'toto9'
166 else:
167 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
168 else:
169 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
170 elif data2[0] == dico_scaff[i][0][0]: # --> this scaffold is at the begining of the list
171 if dico_scaff[i][1][0] == 'FWD': # --> if this scaffold is FWD orientated
172 if data2[5] != '1':
173 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the begining
174 dico_scaff[i][0].insert(0,data2[4])
175 dico_scaff[i][1].insert(0,data2[7])
176 elif data2[8] == 'REV': # --> the new scaffold could not be inserted
177 dico_scaff[i][2].add(line2)
178 # print 'toto10'
179 else:
180 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
181 else:
182 if data2[8] == 'REV': # --> the new scaffold could be inserted at begining
183 dico_scaff[i][0].insert(0,data2[4])
184 dico_scaff[i][1].insert(0,'REV')
185 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted
186 dico_scaff[i][2].add(line2)
187 # print 'toto11'
188 else:
189 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
190 elif dico_scaff[i][1][0] == 'REV': # --> if this scaffold is REV orientated
191 if data2[5] == '1':
192 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the begining
193 dico_scaff[i][0].insert(0,data2[4])
194 dico_scaff[i][1].insert(0,'REV')
195 elif data2[8] == 'REV': # --> the new scaffold could not be inserted
196 dico_scaff[i][2].add(line2)
197 # print 'toto12'
198 else:
199 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
200 else:
201 if data2[8] == 'REV': # --> the new scaffold could be inserted at begining
202 dico_scaff[i][0].insert(0,data2[4])
203 dico_scaff[i][1].insert(0,data2[7])
204 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted
205 dico_scaff[i][2].add(line2)
206 # print 'toto13'
207 else:
208 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
209 else:
210 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized')
211 else:
212 dico_scaff[i][2].add(line2)
213 # print 'toto14'
214
215
216 outfile = open(options.out,'w')
217 for n in dico_scaff:
218 liste = list(dico_scaff[n][0])
219 liste.sort()
220 outfile.write('>'+liste[0]+'\n')
221 # print '>'+liste[0]
222 for k in dico_scaff[n][0]:
223 outfile.write('\t'.join([k,dico_scaff[n][1][dico_scaff[n][0].index(k)]])+'\n')
224 # print '\t'.join([k,dico_scaff[n][1][dico_scaff[n][0].index(k)]])
225 for k in dico_scaff[n][2]:
226 data = k.split()
227 if data[5] == '1':
228 if data[3] == data[8]:
229 outfile.write('not_grouped\t'+data[0]+'\tFWD\t'+data[4]+'\t'+data[7]+'\n')
230 else:
231 outfile.write('not_grouped\t'+data[0]+'\tREV\t'+data[4]+'\t'+data[7]+'\n')
232 else:
233 if data[3] == data[8]:
234 outfile.write('not_grouped\t'+data[4]+'\t'+data[7]+'\t'+data[0]+'\tFWD'+'\n')
235 else:
236 outfile.write('not_grouped\t'+data[4]+'\t'+data[7]+'\t'+data[0]+'\tREV'+'\n')
237 outfile.close()
238
239
240
241 if __name__ == "__main__": __main__()