0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 This tool takes two or more tab-delimited text files as input and joins them on a shared hinge (the leading columns of each line), writing the selected columns of every file side by side for each hinge value. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
|
|
5
|
|
6 usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
|
|
7 -o, output=0: the output pileup
|
|
8 -1, input1=1: the pileup file to start with
|
|
9 -2, input2=2: the second pileup file to join
|
|
10 -g, hinge=h: the columns to be used for matching
|
|
11 -c, columns=c: the columns that should appear in the output
|
|
12 -f, fill_options_file=f: the file specifying the fill value to use
|
|
13 other_inputs: the other input files to join
|
|
14 """
|
|
15
|
|
16 import optparse, os, re, struct, sys, tempfile
|
|
17 from galaxy.util.bunch import Bunch
|
|
18 from galaxy.util import stringify_dictionary_keys
|
|
19 import json
|
|
20
|
|
def stop_err( msg ):
    """
    Writes msg to stderr and terminates the script with exit status 1.

    Raises SystemExit.
    """
    sys.stderr.write( msg )
    # a bare sys.exit() would report success (status 0) on the error path
    sys.exit( 1 )
|
|
24
|
|
def split_nums( text ):
    """
    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]

    Runs of digits are returned as ints (so leading zeros are dropped), every
    other run of characters is returned unchanged as a string. An empty string
    gives an empty list.
    """
    pieces = []
    # each match is either one whole run of digits (captured in group 1) or one
    # whole run of non-digits, so the runs come out in their original order
    for match in re.finditer( r'(\d+)|\D+', text ):
        digits = match.group( 1 )
        if digits is not None:
            pieces.append( int( digits ) )
        else:
            pieces.append( match.group( 0 ) )
    return pieces
|
|
49
|
|
def hinge_compare( hinge1, hinge2 ):
    """
    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
    first part handled as text but last part as number

    hinge1, hinge2: hinge strings whose columns are separated by tabs

    Each column is broken up with split_nums. Number pieces compare
    numerically, text pieces compare as strings, a number is less than text,
    and the side that runs out of pieces (or columns) first is the smaller one.
    A hinge whose columns are all empty is less than any other hinge.

    Returns 1 if hinge1 is larger, -1 if it is smaller, 0 if both are equal.
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    empty1 = not ''.join( split_hinge1 )
    empty2 = not ''.join( split_hinge2 )
    if empty1 and empty2:
        return 0
    if empty2:
        return 1
    if empty1:
        return -1
    # go through all parts of the hinges and compare
    for i, sh1 in enumerate( split_hinge1 ):
        # if second hinge has no more columns, first is considered larger
        # (indexing past the end used to raise IndexError)
        if i >= len( split_hinge2 ):
            return 1
        sh2 = split_hinge2[ i ]
        # if these hinge segments are the same, just move on to the next ones
        if sh1 == sh2:
            continue
        # check all parts of each hinge
        h1 = split_nums( sh1 )
        h2 = split_nums( sh2 )
        for j, h in enumerate( h1 ):
            # if second hinge has no more parts, first is considered larger;
            # this must also hold for j == 0, when the second segment is empty
            if len( h2 ) <= j:
                return 1
            other = h2[ j ]
            # if these two parts are the same, move on to next
            if h == other:
                continue
            h_is_num = isinstance( h, int )
            if h_is_num != isinstance( other, int ):
                # numbers are less than letters
                if h_is_num:
                    return -1
                return 1
            # same kind of part on both sides and they differ
            if h > other:
                return 1
            return -1
        # every part of the first segment matched; if the second segment has
        # parts left over the first is smaller, which keeps the result
        # consistent with comparing the two hinges the other way round
        if len( h1 ) < len( h2 ):
            return -1
    # if all else has failed, just do basic string comparison
    if hinge1 > hinge2:
        return 1
    elif hinge1 == hinge2:
        return 0
    return -1
|
|
107
|
|
def hinge_sort( infile, outfile, hinge ):
    """
    Given input file name, sorts logically (text vs. numeric) into provided output file name.

    infile: name of the tab-delimited file to sort
    outfile: name of the file the sorted lines are written to
    hinge: number of leading columns that make up the sort key

    Lines are ordered by hinge_compare on their hinge; lines sharing a hinge
    keep their original relative order. Blank lines are left out of the output
    and every line written ends with a newline.
    """
    from functools import cmp_to_key
    # remember only the hinge and the offsets of its lines; the lines
    # themselves are read again when the output is written
    hinge_locs = {}
    with open( infile, 'rb' ) as fin:
        while True:
            # take the offset before reading rather than deriving it from the line length
            loc = fin.tell()
            line = fin.readline()
            if not line:
                break
            if not line.strip():
                # skip a blank line instead of treating it as the end of the
                # file, which silently dropped everything after it
                continue
            # drop the line ending so that it cannot become part of the key
            # when the line has no columns beyond the hinge
            hinge_key = '\t'.join( line.rstrip( '\r\n' ).split( '\t' )[ :hinge ] )
            hinge_locs.setdefault( hinge_key, [] ).append( loc )
        with open( outfile, 'wb' ) as fout:
            for hinge_key in sorted( hinge_locs, key=cmp_to_key( hinge_compare ) ):
                for loc in hinge_locs[ hinge_key ]:
                    fin.seek( loc )
                    line = fin.readline()
                    # a final line without a line ending must not be glued to
                    # whichever line happens to be written after it
                    if not line.endswith( '\n' ):
                        line += '\n'
                    fout.write( line )
|
|
136
|
|
def _fill_columns( num_files, cols, fill_empty ):
    """Returns the cells that stand in for the selected columns of num_files files lacking the current hinge."""
    if not fill_empty:
        return [ '' ] * ( num_files * len( cols ) )
    return [ fill_empty[ col ] for col in cols ] * num_files


def _read_hinged_line( handle, hinge, delimiter ):
    """Returns the next line of handle (line ending removed) with a non-empty hinge, or '' at end of file."""
    while True:
        raw = handle.readline()
        if not raw:
            return ''
        line = raw.rstrip( '\r\n' )
        if ''.join( line.split( delimiter )[ :hinge ] ):
            return line


def __main__():
    """
    Joins the input files on their hinge and writes the result to the output file.

    Reads the options -o/--output, -1/--input1, -2/--input2, -g/--hinge,
    -c/--columns and -f/--fill_options_file; any remaining arguments are
    further input files. The hinge is the number of leading columns used for
    matching, and only columns numbered above the hinge are output.

    Every input is first sorted with hinge_sort into a temporary file. The
    sorted files are then merged: each distinct hinge becomes one output line
    holding the hinge followed by the selected columns of each file, in input
    order. Columns of a file that lacks the hinge are left empty, or set to the
    fill values from the fill options file (entry 'file1_columns', indexed by
    column number minus one) when that is given. Lines whose hinge columns are
    all empty are skipped.
    """
    from functools import cmp_to_key
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    ( options, args ) = parser.parse_args()
    hinge = int( options.hinge )
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    # both branches of the former if/elif added the extra arguments
    inputs = [ options.input1, options.input2 ]
    inputs.extend( args )
    fill_options = None
    if options.fill_options_file != 'None' and options.fill_options_file is not None:
        # best effort: a fill options file that cannot be used only produces a warning
        try:
            with open( options.fill_options_file ) as fill_handle:
                fill_options = Bunch( **stringify_dictionary_keys( json.load( fill_handle ) ) )
        except Exception as e:
            print( 'Warning: Ignoring fill options due to json error (%s).' % e )
    if fill_options is None:
        fill_options = Bunch()
    if 'file1_columns' not in fill_options:
        fill_options.file1_columns = None
    if fill_options and fill_options.file1_columns:
        # fill value for each selected column, keyed by column number
        fill_empty = dict( ( col, fill_options.file1_columns[ col - 1 ] ) for col in cols )
    else:
        fill_empty = None
    # an assert would be stripped when running with -O, so report the problem explicitly
    if not cols:
        parser.error( 'You need to select at least one column in addition to the hinge' )
    delimiter = '\t'
    hinge_key = cmp_to_key( hinge_compare )
    tmp_file_names = []
    tmp_input_files = []
    try:
        # make sure all files are sorted in same way, ascending
        for in_file in inputs:
            # mkstemp creates the file itself, so the name cannot be taken by
            # someone else between choosing it and writing to it
            tmp_fd, tmp_file_name = tempfile.mkstemp()
            os.close( tmp_fd )
            tmp_file_names.append( tmp_file_name )
            hinge_sort( in_file, tmp_file_name, hinge )
            tmp_input_files.append( open( tmp_file_name, 'rb' ) )
        # cycle through files, getting smallest line of all files one at a time
        # also have to keep track of vertical position of extra columns
        with open( options.output, 'w' ) as fout:
            old_current = ''
            first_line = True
            # an exhausted file is represented by an empty current line
            current_lines = [ _read_hinged_line( f, hinge, delimiter ) for f in tmp_input_files ]
            last_lines = ''.join( current_lines )
            last_loc = -1
            while last_lines:
                # get the "minimum" hinge, which should come first, and the file location in list
                hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
                hinge_dict = {}
                for i, file_hinge in enumerate( hinges ):
                    # keep the first file that carries each hinge
                    hinge_dict.setdefault( file_hinge, i )
                hinges = [ h for h in sorted( hinges, key=hinge_key ) if h ]
                current = hinges[ 0 ]
                loc = hinge_dict[ current ]
                # first output empty columns for vertical alignment (account for "missing" files)
                # write output for leading and trailing empty columns
                # columns missing from actual file handled further below
                current_data = []
                if current != old_current:
                    if not first_line:
                        # fill trailing empty columns with appropriate fill value
                        if last_loc < len( inputs ) - 1:
                            filler = _fill_columns( len( inputs ) - last_loc - 1, cols, fill_empty )
                            fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
                        # insert line break before current line
                        fout.write( '\n' )
                    # fill leading empty columns with appropriate fill value
                    if loc > 0:
                        current_data = _fill_columns( loc, cols, fill_empty )
                elif loc - last_loc > 1:
                    # same hinge as before: fill the columns of the files skipped in between
                    current_data = _fill_columns( loc - last_loc - 1, cols, fill_empty )
                # now output actual data
                split_line = current_lines[ loc ].split( delimiter )
                # fill empties within actual line if appropriate; a column
                # without a fill value keeps its empty cell
                if fill_empty:
                    split_line = [ item or fill_empty.get( i + 1, item ) for i, item in enumerate( split_line ) ]
                # add actual data to be output below; the line always holds
                # data here because lines with an empty hinge were skipped
                for col in cols:
                    # if this column doesn't exist, add the appropriate filler or empty column
                    try:
                        new_item = split_line[ col - 1 ]
                    except IndexError:
                        if fill_empty:
                            new_item = fill_empty[ col ]
                        else:
                            new_item = ''
                    current_data.append( new_item )
                # grab next line for selected file
                current_lines[ loc ] = _read_hinged_line( tmp_input_files[ loc ], hinge, delimiter )
                # write relevant data to file
                if current_data:
                    if current == old_current:
                        fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
                    else:
                        fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
                last_lines = ''.join( current_lines )
                last_loc = loc
                old_current = current
                first_line = False
            # fill trailing empty columns for final line
            if last_loc < len( inputs ) - 1:
                filler = _fill_columns( len( inputs ) - last_loc - 1, cols, fill_empty )
                fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
            fout.write( '\n' )
    finally:
        # release and remove the temporary files even when the join fails
        for f in tmp_input_files:
            f.close()
        for tmp_file_name in tmp_file_names:
            os.unlink( tmp_file_name )
|
|
278
|
|
# run the tool only when this file is executed as a script
if __name__ == "__main__":
    __main__()
|