Mercurial > repos > xuebing > sharplabtool
comparison tools/new_operations/column_join.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 This tool takes two or more tab-delimited text files as input and joins them on their leading "hinge" columns, writing the selected columns of every file side by side on one output line per hinge. Columns that are missing from a file are left empty, or are filled with the values given in the fill options file. | |
5 | |
6 usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]] | |
7 -o, output=0: the output pileup | |
8 -1, input1=1: the pileup file to start with | |
9 -2, input2=2: the second pileup file to join | |
10 -g, hinge=h: the columns to be used for matching | |
11 -c, columns=c: the columns that should appear in the output | |
12 -f, fill_options_file=f: the file specifying the fill value to use | |
13 other_inputs: the other input files to join | |
14 """ | |
15 | |
16 import optparse, os, re, struct, sys, tempfile | |
17 | |
# The Galaxy helpers and simplejson are optional at import time.  When any of
# these imports fails, simplejson is set to None and the failure is kept in
# simplejson_exception so that __main__ can report it.
simplejson_exception = None
try:
    from galaxy import eggs
    from galaxy.util.bunch import Bunch
    from galaxy.util import stringify_dictionary_keys
    import pkg_resources
    pkg_resources.require( "simplejson" )
    import simplejson
except Exception as e:
    simplejson_exception = e
    simplejson = None
29 | |
def stop_err( msg ):
    """
    Writes msg to standard error and terminates the script.

    Exits with status 1 so that callers checking the exit code see the
    failure; a bare sys.exit() would report success (status 0).
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
33 | |
def split_nums( text ):
    """
    Breaks text into alternating runs of digits and non-digits and returns them
    in order as a list, with every digit run converted to an int,
    e.g. 'abc23B3' --> [ 'abc', 23, 'B', 3 ].  An empty string gives [].
    """
    pieces = []
    # each match fills exactly one of the two groups: a digit run or a text run
    for digits, other in re.findall( r'(\d+)|(\D+)', text ):
        if digits:
            pieces.append( int( digits ) )
        else:
            pieces.append( other )
    return pieces
58 | |
def hinge_compare( hinge1, hinge2 ):
    """
    cmp-style comparison of two hinges (tab-joined column values).

    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so
    that runs of text are compared as text and runs of digits as numbers.
    Returns 1 if hinge1 sorts after hinge2, -1 if it sorts before, 0 if equal.

    Rules, applied segment by segment (segments are separated by tabs):
      - a hinge whose segments are all empty sorts before any other hinge
      - within a segment, a number sorts before text
      - a segment that is a proper prefix of the other sorts first
      - a hinge with fewer segments sorts first when all shared segments match
      - anything still undecided falls back to plain string comparison
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    empty1 = not ''.join( split_hinge1 )
    empty2 = not ''.join( split_hinge2 )
    if empty1 and empty2:
        return 0
    if empty2:
        return 1
    if empty1:
        return -1
    # go through all parts of the hinges and compare
    for i, sh1 in enumerate( split_hinge1 ):
        # second hinge has run out of segments, so the first is considered larger
        if i >= len( split_hinge2 ):
            return 1
        sh2 = split_hinge2[ i ]
        # if these hinge segments are the same, just move on to the next ones
        if sh1 == sh2:
            continue
        h1 = split_nums( sh1 )
        h2 = split_nums( sh2 )
        for j, h in enumerate( h1 ):
            # if second segment has no more parts (including none at all),
            # first is considered larger
            if len( h2 ) <= j:
                return 1
            other = h2[ j ]
            # if these two parts are the same, move on to next
            if h == other:
                continue
            # split_nums yields text as str and numbers as int (or long on
            # Python 2), so "not a str" identifies the numeric parts
            h_is_text = isinstance( h, str )
            other_is_text = isinstance( other, str )
            if h_is_text != other_is_text:
                # numbers are less than letters
                if h_is_text:
                    return 1
                return -1
            if h > other:
                return 1
            return -1
        # every part of the first segment matched; if the second segment has
        # parts left over, the first is a proper prefix and sorts first
        if len( h1 ) < len( h2 ):
            return -1
    # if all else has failed, just do basic string comparison
    if hinge1 > hinge2:
        return 1
    if hinge1 < hinge2:
        return -1
    return 0
116 | |
def hinge_sort( infile, outfile, hinge ):
    """
    Given input file name, sorts logically (text vs. numeric) into provided output file name.

    infile:  name of the tab-delimited file to sort
    outfile: name of the file to write the sorted lines to
    hinge:   number of leading columns that make up the sort key

    Lines are ordered with hinge_compare; lines sharing a hinge keep their
    original relative order.  Blank lines are skipped.  Only the byte offset of
    each line is kept in memory, and the lines are re-read when written.
    """
    # local import: functools is not among the file's top-level imports
    from functools import cmp_to_key
    hinge_locs = {}
    with open( infile, 'rb' ) as fin:
        # first pass: record where each line starts, grouped by hinge
        while True:
            loc = fin.tell()
            line = fin.readline()
            if not line:
                break
            # skip blank lines instead of treating the first one as end of file
            if not line.strip():
                continue
            # strip the line ending first so that a line with no columns past
            # the hinge does not get a newline embedded in its key
            hinge_parts = line.rstrip( '\r\n' ).split( '\t' )[ :hinge ]
            hinge_locs.setdefault( '\t'.join( hinge_parts ), [] ).append( loc )
        # second pass: copy the lines out in hinge order
        with open( outfile, 'wb' ) as fout:
            for hinge_loc in sorted( hinge_locs, key=cmp_to_key( hinge_compare ) ):
                for loc in hinge_locs[ hinge_loc ]:
                    fin.seek( loc )
                    line = fin.readline()
                    # the last line of the input may lack a line ending; add
                    # one so it is not glued to the line written after it
                    if not line.endswith( '\n' ):
                        line += '\n'
                    fout.write( line )
145 | |
def _filler( num_files, cols, fill_empty ):
    """Returns the placeholder values standing in for the output columns of num_files absent files."""
    if num_files <= 0:
        return []
    if not fill_empty:
        return [ '' ] * ( num_files * len( cols ) )
    return [ fill_empty[ col ] for col in cols ] * num_files

def _load_fill_values( fill_options_file, cols ):
    """
    Returns a dict mapping each column number in cols to its fill value, or
    None when no fill values apply.

    The fill options file is JSON; its 'file1_columns' entry is indexed by
    column number minus one.  If the file cannot be read or parsed, a warning
    is printed and None is returned.  An IndexError is raised if
    'file1_columns' has no entry for a selected column.
    """
    if fill_options_file is None or fill_options_file == 'None':
        return None
    file1_columns = None
    try:
        if simplejson is None:
            raise simplejson_exception
        fill_file = open( fill_options_file )
        try:
            file1_columns = simplejson.load( fill_file ).get( 'file1_columns' )
        finally:
            fill_file.close()
    except Exception as e:
        print( 'Warning: Ignoring fill options due to simplejson error (%s).' % e )
    if not file1_columns:
        return None
    return dict( ( col, file1_columns[ col - 1 ] ) for col in cols )

def _merge_sorted( tmp_input_files, fout, hinge, cols, fill_empty ):
    """
    Merges the hinge-sorted files in tmp_input_files into fout.

    One output line is written per distinct hinge: the hinge itself, followed
    by the columns in cols taken from each file in order.  A file that has no
    line for the hinge contributes one placeholder per column (see _filler).
    """
    # local import: functools is not among the file's top-level imports
    from functools import cmp_to_key
    hinge_key = cmp_to_key( hinge_compare )
    delimiter = '\t'
    num_inputs = len( tmp_input_files )
    old_current = ''
    first_line = True
    last_loc = -1
    # cycle through files, getting smallest line of all files one at a time
    # also have to keep track of vertical position of extra columns
    current_lines = [ f.readline().rstrip( '\r\n' ) for f in tmp_input_files ]
    last_lines = ''.join( current_lines )
    while last_lines:
        # get the "minimum" hinge, which should come first, and the file location in list
        hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
        hinge_dict = {}
        for i, h in enumerate( hinges ):
            if h not in hinge_dict:
                hinge_dict[ h ] = i
        hinges = [ h for h in sorted( hinges, key=hinge_key ) if h ]
        if not hinges:
            # only lines with an empty hinge are left; there is nothing to join them on
            break
        current = hinges[ 0 ]
        loc = hinge_dict[ current ]
        # first output empty columns for vertical alignment (account for "missing" files)
        current_data = []
        if current != old_current:
            if not first_line:
                # fill trailing empty columns of the previous output line
                if last_loc < num_inputs - 1:
                    filler = _filler( num_inputs - last_loc - 1, cols, fill_empty )
                    fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
                # insert line break before current line
                fout.write( '\n' )
            # fill leading empty columns for the files before this one
            if loc > 0:
                current_data = _filler( loc, cols, fill_empty )
        elif loc - last_loc > 1:
            # same hinge as the previous step: fill the files skipped in between
            current_data = _filler( loc - last_loc - 1, cols, fill_empty )
        # now output actual data
        split_line = current_lines[ loc ].split( delimiter )
        # fill empties within actual line if appropriate
        if fill_empty:
            filled_line = []
            for i, item in enumerate( split_line ):
                if not item:
                    item = fill_empty.get( i + 1, item )
                filled_line.append( item )
            split_line = filled_line
        if ''.join( split_line ):
            for col in cols:
                # if this column doesn't exist, add the appropriate filler or empty column
                try:
                    new_item = split_line[ col - 1 ]
                except IndexError:
                    if fill_empty:
                        new_item = fill_empty[ col ]
                    else:
                        new_item = ''
                current_data.append( new_item )
            # grab next line for selected file
            current_lines[ loc ] = tmp_input_files[ loc ].readline().rstrip( '\r\n' )
            # write relevant data to file; the hinge is written once per output line
            if current_data:
                if current == old_current:
                    fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
                else:
                    fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
            last_lines = ''.join( current_lines )
        else:
            # defensive: stop if the selected line turns out to be empty
            last_lines = None
        last_loc = loc
        old_current = current
        first_line = False
    # fill trailing empty columns for final line
    if last_loc < num_inputs - 1:
        filler = _filler( num_inputs - last_loc - 1, cols, fill_empty )
        fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
    fout.write( '\n' )

def __main__():
    """
    Parses the command line, sorts every input file by its hinge into a
    temporary file, and writes the column-wise join of all inputs to the
    output file.  Temporary files are removed on exit, also after an error.
    """
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    hinge = int( options.hinge )
    # hinge columns are always written, so only columns past the hinge are selectable
    requested = [ int( c ) for c in str( options.columns ).split( ',' ) ]
    cols = [ col for col in requested if col > hinge ]
    if not cols:
        stop_err( 'You need to select at least one column in addition to the hinge\n' )
    # any positional arguments are further input files to join
    inputs = [ options.input1, options.input2 ] + list( args )
    fill_empty = _load_fill_values( options.fill_options_file, cols )
    # make sure all files are sorted in same way, ascending
    tmp_file_names = []
    tmp_input_files = []
    try:
        for in_file in inputs:
            # mkstemp creates the file itself, so the name cannot be taken by
            # something else between picking it and writing to it
            fd, tmp_file_name = tempfile.mkstemp()
            os.close( fd )
            tmp_file_names.append( tmp_file_name )
            hinge_sort( in_file, tmp_file_name, hinge )
            tmp_input_files.append( open( tmp_file_name, 'rb' ) )
        fout = open( options.output, 'w' )
        try:
            _merge_sorted( tmp_input_files, fout, hinge, cols, fill_empty )
        finally:
            fout.close()
    finally:
        for f in tmp_input_files:
            f.close()
        for tmp_file_name in tmp_file_names:
            try:
                os.unlink( tmp_file_name )
            except OSError:
                # best-effort cleanup; do not mask an error from the join itself
                pass
289 | |
# run the join only when this file is executed as a script
if __name__ == "__main__":
    __main__()