Mercurial > repos > bgruening > text_processing
comparison easyjoin @ 0:5314e5d6f040 draft
Imported from capsule None
| author | bgruening | 
|---|---|
| date | Thu, 29 Jan 2015 07:53:17 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:5314e5d6f040 | 
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 ## EASY Join - | |
| 3 ## Join with automatic pre-sorting of both files | |
| 4 ## Copyright (C) 2010 A. Gordon (gordon@cshl.edu) | |
| 5 ## license: AGPLv3+ | |
| 6 use strict; | |
| 7 use warnings; | |
| 8 use Data::Dumper; | |
| 9 use Getopt::Long qw(:config bundling no_ignore_case_always); | |
| 10 use File::Temp qw/tempfile/; | |
| 11 use POSIX qw(locale_h); | |
| 12 | |
| 13 sub show_help(); | |
| 14 sub show_version(); | |
| 15 sub show_examples(); | |
| 16 sub parse_commandline_options(); | |
| 17 sub sort_file($$$); | |
| 18 sub join_files($$); | |
| 19 sub cleanup_files(@); | |
| 20 | |
| 21 | |
## Program identity: printed by --help / --version and used as the prefix
## of every diagnostic message.
my ($PROGRAM, $VERSION) = ("easyjoin", "0.6.1");

## Option state. Everything below is filled in by parse_commandline_options();
## a value left undefined means "option not given on the command line".
my $debug;                                          ## --debug
my $HEADER;                                         ## --header (also set by --allh)
my $IGNORE_CASE;                                    ## -i / --ignore-case
my $FIELD_SEP;                                      ## -t
my ($FILE1_KEY_COLUMN, $FILE2_KEY_COLUMN) = (1, 1); ## -1 / -2 (or -j for both)
my @OUTPUT_SPECIFIERS;                              ## accumulated "-a N" / "-v N" pairs
my $OUTPUT_FORMAT;                                  ## -o
my $EMPTY_FILLER;                                   ## -e
my $SORT_BUFFER_SIZE;                               ## -S / --buffer-size
my $SORT_TEMP_DIR;                                  ## -T / --temporary-directory

## The two positional arguments (the files to sort and join).
my ($input_filename1, $input_filename2);
| 38 | |
| 39 ## | |
| 40 ## Program Start | |
| 41 ## | |
## "C" locale is critical for sorting and joining correctly.
## LC_ALL takes precedence over LANG (and over every LC_* category), so setting
## LANG alone has no effect when the caller's environment already exports
## LC_ALL or LC_COLLATE. Set both, so 'sort' and 'join' agree on the collation.
$ENV{'LANG'}="C";
$ENV{'LC_ALL'}="C";
parse_commandline_options();

## Let File::Temp create (and thereby reserve) both temporary files. The
## previous OPEN=>0 form only generated names, leaving a window in which
## another process could create a file with the same name.
## 'sort --output' simply overwrites the (empty) files.
my ($tmp_fh1, $tmp_filename1) = tempfile();
my ($tmp_fh2, $tmp_filename2) = tempfile();
close($tmp_fh1) or die "$PROGRAM: Error: failed to close temporary file '$tmp_filename1': $!\n";
close($tmp_fh2) or die "$PROGRAM: Error: failed to close temporary file '$tmp_filename2': $!\n";

## Run sort, sort, join inside an eval, so the temporary files are cleaned up
## even when one of the steps die()s. The error is re-thrown after the cleanup.
my $join_exit_code = eval {
    sort_file($input_filename1, $tmp_filename1, $FILE1_KEY_COLUMN);
    sort_file($input_filename2, $tmp_filename2, $FILE2_KEY_COLUMN);
    join_files($tmp_filename1, $tmp_filename2);
};
my $failure = $@;
cleanup_files($tmp_filename1, $tmp_filename2);
die $failure if $failure;
exit($join_exit_code);
| 51 | |
| 52 ## | |
| 53 ## Program end | |
| 54 ## | |
| 55 | |
| 56 | |
##
## Print the usage screen to STDOUT and exit with status 0.
## Installed as the Getopt::Long handler of "--help"; never returns.
##
sub show_help()
{
## NOTE: the option names in this text must match the GetOptions() table in
## parse_commandline_options() (e.g. "--examples", "-o auto").
print<<EOF;
${PROGRAM}: Wrapper for GNU join+sort, automatically sorts files before joining them.

Usage: $PROGRAM [OPTIONS] [JOIN-OPTIONS] [SORT-OPTIONS] FILE1 FILE2

OPTIONS: Options specific to this program:

--header = Both input files have a header line as the first line.
The header line will be joined properly, without being sorted.

--version = Print ${PROGRAM}'s version.

--debug = Print debug messages (relating to ${PROGRAM}'s operation).

--help = Show this help screen.

--examples = Show usage examples.

--all = Short-cut for:
-a 1 -a 2 -o auto -e . -t <TAB>
This will show all values (paired and unpaired) from both files,
Automatically formatting the columns, and using TAB as field separator.
You can override the empty filler (-e X) on the command line.

--allh = Short-cut for:
-a 1 -a 2 -o auto -e . -t <TAB> --header
Same as above, but will also respect the header line from both input files.

JOIN-OPTIONS:
All of GNU join options are supported.
Run:
join --help
To see all possible joining options.

SORT-OPTIONS:
The following options are supported for the intermediate sorting step:

-S SIZE
--buffer-size SIZE = GNU sort's --buffer-size option.

-T DIR
--temporary-directory DIR = GNU sort's --temporary-directory option.

Run:
sort --help
To learn about these options. They might improve sorting performance for big files.

FILE1 FILE2:
The two input files to be sorted, joined.
Unlike GNU join, joining STDIN is not supported. Both files must be real files.


NOTE About "--header" and "-o auto":
The "--header" feature requires GNU coreutils version 8.6 or later.
The "-o auto" feature requires GNU coreutils version 8.10 or later.

EOF
exit(0);
}
| 118 | |
##
## Print the program name, version and license notice to STDOUT,
## then exit with status 0. Installed as the "--version" handler; never returns.
##
sub show_version()
{
    ## One list element per output line; an empty element is a blank line.
    my @banner_lines = (
        "$PROGRAM $VERSION",
        'Copyright (C) 2010 A. Gordon (gordon@cshl.edu)',
        'License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)',
        '',
        "To see the GNU's join version, run:",
        'join --version',
    );

    print map { "$_\n" } @banner_lines;
    exit(0);
}
| 131 | |
##
## Print a worked usage example to STDOUT and exit with status 0.
## Installed as the "--examples" handler; never returns.
##
sub show_examples()
{
    ## Nothing in the example text is interpolated, so a single-quoted
    ## here-document is used and the shell prompts ("$") need no escaping.
    my $examples_text = <<'END_OF_EXAMPLES';
Example of joining two unsorted files (each file having a header line):

$ cat input1.txt
Fruit Color
Apple red
Banana yellow
Orange orange
Melon green

$ cat input2.txt
Fruit Price
Orange 7
Avocado 8
Apple 4
Banana 3

$ easyjoin -j 1 -a 1 -a 2 --header -e . -o auto input1.txt input2.txt
Fruit Color Price
Apple red 4
Avocado . 8
Banana yellow 3
Melon green .
Orange orange 7

## A short-cut for all the options above:
$ easyjoin --allh input1.txt input2.txt
Fruit Color Price
Apple red 4
Avocado . 8
Banana yellow 3
Melon green .
Orange orange 7

END_OF_EXAMPLES

    print $examples_text;
    exit(0);
}
| 171 | |
##
## Parse @ARGV into the file-level option variables, and validate the two
## positional file names (stored in $input_filename1 / $input_filename2).
##
## Takes no arguments and returns nothing useful.
## Dies (with a "$PROGRAM: ..." message) on invalid options, on a wrong number
## of file names, when a file name is "-" (STDIN), or when a file doesn't exist.
## The --help / --version / --examples handlers exit the program directly.
##
sub parse_commandline_options()
{
    ## Common part of --all and --allh:
    ##   -a 1 -a 2 -o auto -t <TAB>, plus "-e ." unless -e was already given.
    my $enable_all = sub {
        push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
        $FIELD_SEP = "\t";
        $OUTPUT_FORMAT = "auto";
        $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
    };

    ##
    ## Parse command line
    ##
    my $rc = GetOptions(
        "a=i" => sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] },
        "e=s" => \$EMPTY_FILLER,
        "ignore-case|i" => \$IGNORE_CASE,
        ## -j N is a short-cut for "-1 N -2 N"
        "j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; },
        "o=s" => \$OUTPUT_FORMAT,
        "t=s" => \$FIELD_SEP,
        "v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] },
        "1=i" => \$FILE1_KEY_COLUMN,
        "2=i" => \$FILE2_KEY_COLUMN,
        "debug" => \$debug,
        "header" => \$HEADER,
        "help" => \&show_help,
        "version" => \&show_version,
        "examples" => \&show_examples,
        "buffer-size|S=s" => \$SORT_BUFFER_SIZE,
        "temporary-directory|T=s" => \$SORT_TEMP_DIR,
        "all" => sub { $enable_all->(); },
        "allh" => sub { $enable_all->(); $HEADER=1; },
    );
    die "$PROGRAM: invalid command-line arguments.\n" unless $rc;

    ## We need two file names to join
    my @INPUT_FILES = @ARGV;
    die "$PROGRAM: missing operand: two file names to join\n" if (scalar(@INPUT_FILES)<2);
    die "$PROGRAM: error: too many files specified (can only join two files)\n" if (scalar(@INPUT_FILES)>2);
    die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n" if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-";

    ## The messages end with a newline (like all the others), so perl doesn't
    ## append "at <script> line <N>." to a plain user error.
    die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!\n" unless -e $INPUT_FILES[0];
    die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!\n" unless -e $INPUT_FILES[1];

    $input_filename1 = $INPUT_FILES[0];
    $input_filename2 = $INPUT_FILES[1];
}
| 221 | |
##
## Sort one input file by its key column, writing the result to a new file.
##
## Arguments:
##   $input_filename  - file to sort
##   $output_filename - file the sorted output is written to (--output)
##   $key_column      - 1-based column number, used as "-kN,N"
##
## Runs "sort" (or "./sort-header" when --header is in effect) without a shell.
## Returns nothing useful; dies if the command can't be executed, is killed
## by a signal, or exits with a non-zero code.
##
sub sort_file($$$)
{
    my ($input_filename, $output_filename, $key_column) = @_;

    ## Optional values are tested with defined+length instead of plain
    ## truthiness, so that a literal "0" (e.g. "-t 0", or a temporary
    ## directory named "0") is passed on instead of being silently dropped.
    my $is_given = sub { defined $_[0] && length $_[0] };

    ## NOTE(review): "./sort-header" is resolved relative to the current
    ## working directory, not relative to this script - confirm this is intended.
    my @SORT_COMMAND;
    push @SORT_COMMAND, $HEADER ? "./sort-header" : "sort" ;
    push @SORT_COMMAND, "-f" if $IGNORE_CASE;
    push @SORT_COMMAND, "-k${key_column},${key_column}" ;
    push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if $is_given->($SORT_BUFFER_SIZE);
    push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if $is_given->($SORT_TEMP_DIR);
    push @SORT_COMMAND, "--output", $output_filename;
    push @SORT_COMMAND, "--debugheader" if $debug && $HEADER;
    push @SORT_COMMAND, "-t", $FIELD_SEP if $is_given->($FIELD_SEP);
    push @SORT_COMMAND, $input_filename;

    if ($debug) {
        warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n";
        warn "$PROGRAM: Sort command line:\n";
        print STDERR Dumper(\@SORT_COMMAND), "\n";
    }

    ## List-form system(): no shell is involved, file names need no quoting.
    system(@SORT_COMMAND);
    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute 'sort': $!\n";
    }
    if ($? & 127) {
        my $signal = ($? & 127);
        kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
        die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n";
    }

    my $sort_exit_code = ($? >> 8);
    die "$PROGRAM: Error: 'sort' process failed, exit code $sort_exit_code\n" if $sort_exit_code!=0;
}
| 258 | |
##
## Run "join" on two (already sorted) files; join's output goes to our STDOUT.
##
## Arguments:
##   $file1, $file2 - the sorted files to join
##
## The join options are built from the file-level option variables.
## Returns join's exit code. Dies if join can't be executed or is killed
## by a signal.
##
sub join_files($$)
{
    my ($file1, $file2) = @_;

    ## Optional values are tested with defined+length instead of plain
    ## truthiness: "0" is a valid output format ("-o 0" = the join field)
    ## and a valid separator character ("-t 0"), and must not be dropped.
    my $is_given = sub { defined $_[0] && length $_[0] };

    my @join_command = qw/join/;
    push @join_command, "--header" if $HEADER;
    push @join_command, "--ignore-case" if $IGNORE_CASE;
    push @join_command, "-t", $FIELD_SEP if $is_given->($FIELD_SEP);
    push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN;
    push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN;
    push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER;
    push @join_command, "-o", $OUTPUT_FORMAT if $is_given->($OUTPUT_FORMAT);
    push @join_command, @OUTPUT_SPECIFIERS;
    push @join_command, $file1, $file2;

    if ($debug) {
        warn "$PROGRAM: Running join on '$file1' and '$file2'\n";
        warn "$PROGRAM: join command line:\n";
        print STDERR Dumper(\@join_command), "\n";
    }

    ## List-form system(): no shell is involved, file names need no quoting.
    system(@join_command);
    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute 'join': $!\n";
    }
    if ($? & 127) {
        my $signal = ($? & 127);
        kill 2, $$ if $signal == 2; ##if join was interrupted (CTRL-C) - just pass it on and commit suicide
        die "$PROGRAM: Error: 'join' child-process died with signal $signal\n";
    }

    ## Normal termination: hand join's own exit code back to the caller.
    return ($? >> 8);
}
| 295 | |
##
## Delete the given temporary files.
##
## Arguments: a list of file names.
## In debug mode nothing is deleted; a message naming each kept file is
## printed instead. A failed deletion is reported with a warning (not fatal).
##
sub cleanup_files(@)
{
    for my $tmp_path (@_) {
        if ($debug) {
            warn "$PROGRAM: debug mode, not deleting temporary file '$tmp_path'\n";
            next;
        }

        ## unlink returns the number of files it removed - expect exactly one.
        unlink($tmp_path) == 1
            or warn "$PROGRAM: Error: failed to delete temporary file '$tmp_path': $!\n";
    }
}
