comparison tools/new_operations/column_join.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2
3 """
4 This tool takes two or more tab-delimited text files as input and joins them side by side on their leading "hinge" columns, writing the selected columns of each input to the output. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
5
6 usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
7 -o, output=0: the output pileup
8 -1, input1=1: the pileup file to start with
9 -2, input2=2: the second pileup file to join
10 -g, hinge=h: the columns to be used for matching
11 -c, columns=c: the columns that should appear in the output
12 -f, fill_options_file=f: the file specifying the fill value to use
13 other_inputs: the other input files to join
14 """
15
16 import optparse, os, re, struct, sys, tempfile
17
# Galaxy's bundled helpers and simplejson are optional at import time: when
# any of them cannot be loaded, simplejson is set to None and the reason is
# kept in simplejson_exception so it can be reported when fill options are
# actually requested.
simplejson_exception = None
try:
    from galaxy import eggs
    from galaxy.util.bunch import Bunch
    from galaxy.util import stringify_dictionary_keys
    import pkg_resources
    pkg_resources.require( "simplejson" )
    import simplejson
except Exception as e:
    simplejson_exception = e
    simplejson = None
29
def stop_err( msg ):
    """Write msg to standard error, then end the script through sys.exit()."""
    err_stream = sys.stderr
    err_stream.write( msg )
    sys.exit()
33
def split_nums( text ):
    """
    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]

    Each maximal run of digit characters is converted with int() (so leading
    zeros are dropped); every other run is kept as text. An empty string
    gives an empty list.
    """
    pieces = []
    run = ''
    run_is_num = False
    for ch in text:
        # a character belongs to a number run exactly when int() accepts it
        try:
            int( ch )
            ch_is_num = True
        except ValueError:
            ch_is_num = False
        # the kind of character changed: close the run collected so far
        if run and ch_is_num != run_is_num:
            if run_is_num:
                pieces.append( int( run ) )
            else:
                pieces.append( run )
            run = ''
        run += ch
        run_is_num = ch_is_num
    if run:
        if run_is_num:
            pieces.append( int( run ) )
        else:
            pieces.append( run )
    return pieces

def _is_text( piece ):
    """Tell whether a split_nums() piece is a text run rather than a number."""
    # type( '' ) / type( u'' ) cover str and unicode on Python 2 and str on Python 3
    return isinstance( piece, ( type( '' ), type( u'' ) ) )

def hinge_compare( hinge1, hinge2 ):
    """
    Compares items like 'chr10' and 'chrM' or 'scaffold2' and scaffold10' so that
    first part handled as text but last part as number

    Both hinges are tab-joined strings. They are compared segment by segment
    (split on tabs), and each segment piece by piece (see split_nums):
    numbers compare numerically, text compares as text, and a number sorts
    before text. A hinge made of nothing but delimiters sorts before any
    other hinge. When one side runs out of segments or pieces first, it is
    the smaller one.

    Returns 1 if hinge1 sorts after hinge2, -1 if it sorts before, and 0 if
    they are equal (cmp-style).
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    empty1 = not ''.join( split_hinge1 )
    empty2 = not ''.join( split_hinge2 )
    if empty1 and empty2:
        return 0
    if empty2:
        return 1
    if empty1:
        return -1
    # go through all parts of the hinges and compare
    for i, sh1 in enumerate( split_hinge1 ):
        # second hinge has no more segments: first is considered larger
        if i >= len( split_hinge2 ):
            return 1
        sh2 = split_hinge2[ i ]
        # if these hinge segments are the same, just move on to the next ones
        if sh1 == sh2:
            continue
        h1 = split_nums( sh1 )
        h2 = split_nums( sh2 )
        for j, h in enumerate( h1 ):
            # if second segment has no more pieces, first is considered larger
            if len( h2 ) <= j:
                return 1
            other = h2[ j ]
            if h == other:
                continue
            h_is_text = _is_text( h )
            if h_is_text != _is_text( other ):
                # numbers are less than letters
                if h_is_text:
                    return 1
                return -1
            # same kind on both sides: plain numeric or text comparison
            if h > other:
                return 1
            return -1
        # every piece of the first segment matched; extra pieces on the
        # second side make the first one smaller
        if len( h1 ) < len( h2 ):
            return -1
        # pieces are equal although the raw text differs (e.g. '01' vs '1'):
        # let later segments, or the final string comparison, decide
    # if all else has failed, just do basic string comparison
    if hinge1 > hinge2:
        return 1
    if hinge1 < hinge2:
        return -1
    return 0
116
def hinge_sort( infile, outfile, hinge ):
    """
    Given input file name, sorts logically (text vs. numeric) into provided output file name.

    infile  -- name of the tab-delimited file to read
    outfile -- name of the file that receives the sorted lines
    hinge   -- number of leading columns that make up the sort key

    Only the byte offset of each line is kept in memory; lines are copied to
    outfile by seeking back into infile. Lines sharing a hinge keep their
    original relative order. Blank lines are skipped, and every line written
    ends with a newline.
    """
    # cmp_to_key adapts the cmp-style hinge_compare() for sorted()
    from functools import cmp_to_key
    hinge_locs = {}
    with open( infile, 'rb' ) as fin:
        line = fin.readline()
        # readline() gives '' only at end of file, so a blank line in the
        # middle of the input no longer cuts the rest of the file off
        while line:
            if line.strip():
                # drop the line terminator first so that a line with no
                # columns beyond the hinge gets the same key as a longer one
                hinge_parts = line.rstrip( '\r\n' ).split( '\t' )[ :hinge ]
                hinge_locs.setdefault( '\t'.join( hinge_parts ), [] ).append( fin.tell() - len( line ) )
            line = fin.readline()
        with open( outfile, 'wb' ) as fout:
            for hinge_loc in sorted( hinge_locs, key=cmp_to_key( hinge_compare ) ):
                for loc in hinge_locs[ hinge_loc ]:
                    fin.seek( loc )
                    line = fin.readline()
                    # a final input line without a newline must not be glued
                    # onto the record that follows it in the sorted output
                    if not line.endswith( '\n' ):
                        line += '\n'
                    fout.write( line )
145
def __main__():
    """
    Command-line entry point: join the selected columns of several
    tab-delimited files on their leading hinge columns.

    Options (see the module docstring): -o output, -1/-2 the first two
    inputs, -g number of leading hinge columns, -c comma-separated columns
    to output (only those beyond the hinge are used), -f optional JSON file
    whose 'file1_columns' list supplies fill values; further inputs are
    taken from the positional arguments.

    Every input is first sorted with hinge_sort() into a temporary file.
    The sorted files are then merged: each output row starts with a hinge
    value followed by one group of selected columns per input, in input
    order; groups belonging to inputs that lack that hinge are filled with
    '' or with the configured fill values.

    Raises AssertionError when no selected column lies beyond the hinge.
    Temporary files and the output handle are released even on failure.
    """
    # cmp_to_key adapts the cmp-style hinge_compare() for min()
    from functools import cmp_to_key
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    hinge = int( options.hinge )
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    # any positional arguments are additional input files
    inputs = [ options.input1, options.input2 ]
    inputs.extend( args )
    fill_options = None
    if options.fill_options_file != 'None' and options.fill_options_file is not None:
        try:
            if simplejson is None:
                raise simplejson_exception
            fill_file = open( options.fill_options_file )
            try:
                fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( fill_file ) ) )
            finally:
                fill_file.close()
        except Exception as e:
            print( 'Warning: Ignoring fill options due to simplejson error (%s).' % e )
    # fill_empty maps an output column number to its fill value, or is None
    # when no fill values apply; Bunch is only touched when options were
    # actually loaded, so the tool still runs without the Galaxy helpers
    fill_empty = None
    if fill_options is not None and 'file1_columns' in fill_options and fill_options.file1_columns:
        fill_empty = dict( ( col, fill_options.file1_columns[ col - 1 ] ) for col in cols )
    assert len( cols ) > 0, 'You need to select at least one column in addition to the hinge'
    delimiter = '\t'
    num_inputs = len( inputs )
    hinge_key = cmp_to_key( hinge_compare )

    def make_filler( num_files ):
        """Values filling the selected columns of num_files inputs that have no data."""
        if not fill_empty:
            return [ '' ] * ( num_files * len( cols ) )
        return [ fill_empty[ col ] for col in cols ] * num_files

    tmp_file_names = []
    tmp_input_files = []
    fout = None
    try:
        # make sure all files are sorted in same way, ascending
        for in_file in inputs:
            # mkstemp creates the file, so its name cannot be taken by
            # another process before hinge_sort() writes to it
            tmp_fd, tmp_file_name = tempfile.mkstemp()
            os.close( tmp_fd )
            tmp_file_names.append( tmp_file_name )
            hinge_sort( in_file, tmp_file_name, hinge )
            tmp_input_files.append( open( tmp_file_name, 'rb' ) )
        # cycle through files, getting smallest line of all files one at a time
        # also have to keep track of vertical position of extra columns
        fout = open( options.output, 'w' )
        old_current = ''
        first_line = True
        current_lines = [ f.readline().rstrip( '\r\n' ) for f in tmp_input_files ]
        last_lines = ''.join( current_lines )
        last_loc = -1
        while last_lines:
            # get the "minimum" hinge, which should come first, and the file location in list
            hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
            candidates = [ h for h in hinges if h ]
            if not candidates:
                # only lines without a hinge value are left; nothing more can be joined
                break
            current = min( candidates, key=hinge_key )
            # first input (in order) whose pending line carries this hinge
            loc = hinges.index( current )
            # first output empty columns for vertical alignment (account for "missing" files)
            # write output for leading and trailing empty columns
            # columns missing from actual file handled further below
            current_data = []
            if current != old_current:
                if not first_line:
                    # fill trailing empty columns of the previous row
                    if last_loc < num_inputs - 1:
                        fout.write( '%s%s' % ( delimiter, delimiter.join( make_filler( num_inputs - last_loc - 1 ) ) ) )
                    # insert line break before current line
                    fout.write( '\n' )
                # fill leading empty columns with appropriate fill value
                if loc > 0:
                    current_data = make_filler( loc )
            elif loc - last_loc > 1:
                # same hinge as before, but inputs in between had no line for it
                current_data = make_filler( loc - last_loc - 1 )
            # now output actual data
            split_line = current_lines[ loc ].split( delimiter )
            # fill empties within actual line if appropriate
            if fill_empty:
                filled_line = []
                for i, item in enumerate( split_line ):
                    if item:
                        filled_line.append( item )
                    else:
                        # columns without a configured fill value stay empty
                        filled_line.append( fill_empty.get( i + 1, item ) )
                split_line = filled_line
            # add actual data to be output below
            if ''.join( split_line ):
                for col in cols:
                    # if this column doesn't exist, add the appropriate filler or empty column
                    try:
                        new_item = split_line[ col - 1 ]
                    except IndexError:
                        if fill_empty:
                            new_item = fill_empty[ col ]
                        else:
                            new_item = ''
                    current_data.append( new_item )
                # grab next line for selected file
                current_lines[ loc ] = tmp_input_files[ loc ].readline().rstrip( '\r\n' )
                # write relevant data to file
                if current == old_current and current_data:
                    fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
                elif current_data:
                    fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
                last_lines = ''.join( current_lines )
            else:
                last_lines = None
            last_loc = loc
            old_current = current
            first_line = False
        # fill trailing empty columns for final line
        if last_loc < num_inputs - 1:
            fout.write( '%s%s' % ( delimiter, delimiter.join( make_filler( num_inputs - last_loc - 1 ) ) ) )
        fout.write( '\n' )
    finally:
        if fout is not None:
            fout.close()
        for f in tmp_input_files:
            f.close()
        for tmp_file_name in tmp_file_names:
            # best-effort cleanup: a temp file that is already gone is not an error
            try:
                os.unlink( tmp_file_name )
            except OSError:
                pass

if __name__ == "__main__" : __main__()