annotate column_join.py @ 0:6bb6c0a30c67 draft default tip

Uploaded
author jjohnson
date Tue, 01 Apr 2014 09:30:45 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
2
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
3 """
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
4 This tool takes two or more tab-delimited text files as input and joins them on their leading "hinge" columns, writing the hinge followed by the selected columns of every input file on one line per hinge value. Columns that a file cannot supply are left empty or filled with the values given in the fill options file.
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
5
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
6 usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
7 -o, output=0: the output pileup
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
8 -1, input1=1: the pileup file to start with
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
9 -2, input2=2: the second pileup file to join
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
10 -g, hinge=h: the columns to be used for matching
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
11 -c, columns=c: the columns that should appear in the output
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
12 -f, fill_options_file=f: the file specifying the fill value to use
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
13 other_inputs: the other input files to join
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
14 """
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
15
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
16 import optparse, os, re, struct, sys, tempfile
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
17 from galaxy.util.bunch import Bunch
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
18 from galaxy.util import stringify_dictionary_keys
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
19 import json
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
20
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
def stop_err( msg ):
    """
    Reports a fatal error and terminates the script.

    Writes msg to standard error exactly as given (no newline is appended) and
    then raises SystemExit with status 1, so that the calling process sees a
    failure rather than the status 0 reported by a bare sys.exit().
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
24
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
def split_nums( text ):
    """
    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]

    Runs of decimal digits are returned as integers (so leading zeros are lost:
    '007' --> [ 7 ]); every other run of characters is returned unchanged as a
    string. The pieces keep their order, and an empty string gives an empty list.
    """
    # every match fills exactly one of the two groups: ( digit run, other run )
    pieces = re.findall( r'(\d+)|(\D+)', text )
    return [ int( digits ) if digits else other for digits, other in pieces ]
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
49
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
def hinge_compare( hinge1, hinge2 ):
    """
    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
    first part handled as text but last part as number

    hinge1 and hinge2 are hinge keys: one or more column values joined by tabs.
    Returns 1 if hinge1 sorts after hinge2, -1 if it sorts before, and 0 if the
    two are equal (cmp-style), following these rules:

    - a hinge whose columns are all empty sorts before any other hinge;
    - columns are compared from left to right, and a column missing from the
      shorter hinge counts as empty;
    - within a column the pieces produced by split_nums are compared pairwise:
      numbers numerically, text as plain strings, and a number sorts before text;
    - when the pieces of one column are a prefix of the other's, the shorter
      column sorts first;
    - when every column ties this way (e.g. '01' and '1'), plain string
      comparison of the whole hinges decides.
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    empty1 = not ''.join( split_hinge1 )
    empty2 = not ''.join( split_hinge2 )
    if empty1 or empty2:
        return int( empty2 ) - int( empty1 )
    # pad the shorter hinge so that the columns can be walked pairwise
    width = max( len( split_hinge1 ), len( split_hinge2 ) )
    split_hinge1 += [ '' ] * ( width - len( split_hinge1 ) )
    split_hinge2 += [ '' ] * ( width - len( split_hinge2 ) )
    # go through all parts of the hinges and compare
    for segment1, segment2 in zip( split_hinge1, split_hinge2 ):
        # if these hinge segments are the same, just move on to the next ones
        if segment1 == segment2:
            continue
        parts1 = split_nums( segment1 )
        parts2 = split_nums( segment2 )
        for part1, part2 in zip( parts1, parts2 ):
            # if these two parts are the same, move on to next
            if part1 == part2:
                continue
            # split_nums yields strings for text; anything else is a number
            is_text1 = isinstance( part1, str )
            is_text2 = isinstance( part2, str )
            if is_text1 != is_text2:
                # numbers are less than letters
                return 1 if is_text1 else -1
            return 1 if part1 > part2 else -1
        # all shared parts match: the segment that ran out of parts is the smaller one
        if len( parts1 ) != len( parts2 ):
            return 1 if len( parts1 ) > len( parts2 ) else -1
    # if all else has failed, just do basic string comparison
    if hinge1 > hinge2:
        return 1
    elif hinge1 < hinge2:
        return -1
    return 0
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
107
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
def hinge_sort( infile, outfile, hinge ):
    """Given input file name, sorts logically (text vs. numeric) into provided output file name.

    infile is the path of the tab-delimited file to read and outfile the path
    that receives the sorted copy (it is overwritten). hinge is the number of
    leading columns that make up the sort key; keys are ordered with
    hinge_compare.

    Lines holding only whitespace are dropped. Lines that share a hinge keep
    their input order. Every line written ends with a newline, also when the
    last input line had none.
    """
    from functools import cmp_to_key
    # maps each hinge to the file offsets of the lines that carry it
    hinge_locs = {}
    with open( infile, 'rb' ) as fin:
        # first pass: index the start of every line with content by its hinge
        while True:
            loc = fin.tell()
            line = fin.readline()
            if not line:
                break
            if not line.strip():
                # skip blank lines instead of treating the first one as end of file
                continue
            # drop the line terminator so that a hinge reaching the end of the
            # line gives the same key as the terminator-stripped lines the merge
            # step compares
            hinge_parts = line.rstrip( '\r\n' ).split( '\t' )[ :hinge ]
            hinge_locs.setdefault( '\t'.join( hinge_parts ), [] ).append( loc )
        # second pass: copy the lines out in hinge order
        with open( outfile, 'wb' ) as fout:
            for hinge_loc in sorted( hinge_locs, key=cmp_to_key( hinge_compare ) ):
                for loc in hinge_locs[ hinge_loc ]:
                    fin.seek( loc )
                    line = fin.readline()
                    # an unterminated last line must not run into the next line written
                    if not line.endswith( '\n' ):
                        line += '\n'
                    fout.write( line )
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
136
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
def __main__():
    """
    Command-line entry point: joins the input files on their hinge columns.

    Every input file is first sorted by its hinge (its first `hinge` columns)
    into a temporary file. The sorted files are then merged: for each hinge
    value one output line is written, holding the hinge followed by the
    selected columns of each input file, in the order the files were given.
    Where a file has no line for a hinge, or a line lacks a selected column,
    the position is left empty or, when fill options were loaded, filled with
    the value configured for that column. Lines whose hinge is empty cannot be
    matched; they are skipped and their number is printed.

    The options are listed in the usage text at the top of this file. Missing
    options and an empty column selection are reported through
    OptionParser.error. The temporary files are removed when the merge ends,
    whether it succeeded or not.
    """
    from functools import cmp_to_key
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    for required in ( 'output', 'input1', 'input2', 'hinge', 'columns' ):
        if getattr( options, required ) is None:
            parser.error( 'The --%s option is required' % required )
    hinge = int( options.hinge )
    # hinge columns are always written, so only columns beyond the hinge are kept
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    if not cols:
        parser.error( 'You need to select at least one column in addition to the hinge' )
    # the first two inputs come from options, any further ones are positional
    inputs = [ options.input1, options.input2 ]
    inputs.extend( args )
    fill_options = None
    if options.fill_options_file != 'None' and options.fill_options_file is not None:
        try:
            with open( options.fill_options_file ) as fill_file:
                fill_options = Bunch( **stringify_dictionary_keys( json.load( fill_file ) ) )
        except Exception as e:
            # deliberately best-effort: an unusable fill file only disables filling
            print( 'Warning: Ignoring fill options due to json error (%s).' % e )
    if fill_options is None:
        fill_options = Bunch()
    if 'file1_columns' not in fill_options:
        fill_options.file1_columns = None
    if fill_options and fill_options.file1_columns:
        # fill value for each selected column, keyed by 1-based column number
        fill_empty = dict( ( col, fill_options.file1_columns[ col - 1 ] ) for col in cols )
    else:
        fill_empty = None
    delimiter = '\t'

    def fill_values( n_files ):
        """Returns the placeholders for the selected columns of n_files absent files."""
        if not fill_empty:
            return [ '' ] * ( n_files * len( cols ) )
        return [ fill_empty[ col ] for col in cols ] * n_files

    # a one-item list so that next_line can update the count
    skipped = [ 0 ]

    def next_line( tmp_file ):
        """Returns the next line of tmp_file that has a non-empty hinge, or '' at end of file."""
        while True:
            line = tmp_file.readline()
            if not line:
                return ''
            line = line.rstrip( '\r\n' )
            if delimiter.join( line.split( delimiter )[ :hinge ] ):
                return line
            # a line without a hinge can never be selected below and would
            # otherwise block its file for the rest of the merge
            skipped[ 0 ] += 1

    tmp_file_names = []
    tmp_input_files = []
    fout = None
    try:
        # make sure all files are sorted in same way, ascending
        for in_file in inputs:
            # mkstemp creates the file itself, so the name cannot be taken by
            # another process before hinge_sort writes to it
            tmp_fd, tmp_file_name = tempfile.mkstemp()
            os.close( tmp_fd )
            tmp_file_names.append( tmp_file_name )
            hinge_sort( in_file, tmp_file_name, hinge )
            tmp_input_files.append( open( tmp_file_name, 'rb' ) )
        # cycle through files, getting smallest line of all files one at a time
        # also have to keep track of vertical position of extra columns
        fout = open( options.output, 'w' )
        old_current = ''
        first_line = True
        current_lines = [ next_line( f ) for f in tmp_input_files ]
        last_lines = ''.join( current_lines )
        last_loc = -1
        while last_lines:
            # get the "minimum" hinge, which should come first, and the file location in list
            hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
            hinge_dict = {}
            for i, h in enumerate( hinges ):
                # when several files share a hinge, the first file is served first
                if h not in hinge_dict:
                    hinge_dict[ h ] = i
            hinges.sort( key=cmp_to_key( hinge_compare ) )
            # exhausted files contribute an empty hinge; leave them out
            hinges = [ h for h in hinges if h ]
            current, loc = hinges[0], hinge_dict[ hinges[0] ]
            # first output empty columns for vertical alignment (account for "missing" files)
            # write output for leading and trailing empty columns
            # columns missing from actual file handled further below
            current_data = []
            if current != old_current:
                if not first_line:
                    # fill trailing empty columns with appropriate fill value
                    if last_loc < len( inputs ) - 1:
                        fout.write( '%s%s' % ( delimiter, delimiter.join( fill_values( len( inputs ) - last_loc - 1 ) ) ) )
                    # insert line break before current line
                    fout.write( '\n' )
                # fill leading empty columns with appropriate fill value
                if loc > 0:
                    current_data = fill_values( loc )
            elif loc - last_loc > 1:
                # same hinge as before: pad for the files between the last one written and this one
                current_data = fill_values( loc - last_loc - 1 )
            # now output actual data
            split_line = current_lines[ loc ].split( delimiter )
            # fill empties within actual line if appropriate
            if fill_empty:
                # only the selected columns have a fill value; other empty items stay empty
                split_line = [ item or fill_empty.get( col, item ) for col, item in enumerate( split_line, 1 ) ]
            # add actual data to be output below
            if ''.join( split_line ):
                for col in cols:
                    # if this column doesn't exist, add the appropriate filler or empty column
                    try:
                        new_item = split_line[ col - 1 ]
                    except IndexError:
                        if fill_empty:
                            new_item = fill_empty[ col ]
                        else:
                            new_item = ''
                    current_data.append( new_item )
                # grab next line for selected file
                current_lines[ loc ] = next_line( tmp_input_files[ loc ] )
                # write relevant data to file
                if current == old_current and current_data:
                    fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
                elif current_data:
                    fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
                last_lines = ''.join( current_lines )
            else:
                # nothing left to write for the selected file: stop the merge
                last_lines = None
            last_loc = loc
            old_current = current
            first_line = False
        # fill trailing empty columns for final line
        if last_loc < len( inputs ) - 1:
            fout.write( '%s%s' % ( delimiter, delimiter.join( fill_values( len( inputs ) - last_loc - 1 ) ) ) )
        fout.write( '\n' )
        if skipped[ 0 ]:
            print( 'Skipped %d line(s) with an empty hinge.' % skipped[ 0 ] )
    finally:
        if fout is not None:
            fout.close()
        for tmp_file in tmp_input_files:
            tmp_file.close()
        for tmp_file_name in tmp_file_names:
            try:
                os.unlink( tmp_file_name )
            except OSError:
                # cleanup is best-effort and must not hide an error raised above
                pass
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
278
6bb6c0a30c67 Uploaded
jjohnson
parents:
diff changeset
# run the tool only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    __main__()