| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 ## EASY Join -
 | 
| 
 | 
     3 ## Join with automatic pre-sorting of both files
 | 
| 
 | 
     4 ## Copyright (C) 2010 A. Gordon (gordon@cshl.edu)
 | 
| 
 | 
     5 ## license: AGPLv3+
 | 
| 
 | 
     6 use strict;
 | 
| 
 | 
     7 use warnings;
 | 
| 
 | 
     8 use Data::Dumper;
 | 
| 
 | 
     9 use Getopt::Long qw(:config bundling no_ignore_case_always);
 | 
| 
 | 
    10 use File::Temp qw/tempfile/;
 | 
| 
 | 
    11 use POSIX qw(locale_h);
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 sub show_help();
 | 
| 
 | 
    14 sub show_version();
 | 
| 
 | 
    15 sub show_examples();
 | 
| 
 | 
    16 sub parse_commandline_options();
 | 
| 
 | 
    17 sub sort_file($$$);
 | 
| 
 | 
    18 sub join_files($$);
 | 
| 
 | 
    19 sub cleanup_files(@);
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 my $PROGRAM="easyjoin";
 | 
| 
 | 
    23 my $VERSION="0.6.1";
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 my $debug=undef;
 | 
| 
 | 
    26 my $HEADER=undef;
 | 
| 
 | 
    27 my $IGNORE_CASE=undef;
 | 
| 
 | 
    28 my $FIELD_SEP=undef;
 | 
| 
 | 
    29 my $FILE1_KEY_COLUMN=1;
 | 
| 
 | 
    30 my $FILE2_KEY_COLUMN=1;
 | 
| 
 | 
    31 my @OUTPUT_SPECIFIERS=();
 | 
| 
 | 
    32 my $OUTPUT_FORMAT=undef;
 | 
| 
 | 
    33 my $EMPTY_FILLER=undef;
 | 
| 
 | 
    34 my $SORT_BUFFER_SIZE=undef;
 | 
| 
 | 
    35 my $SORT_TEMP_DIR=undef;
 | 
| 
 | 
    36 my $input_filename1;
 | 
| 
 | 
    37 my $input_filename2;
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 ##
 | 
| 
 | 
    40 ## Program Start
 | 
| 
 | 
    41 ##
 | 
| 
 | 
    42 $ENV{'LANG'}="C";## "C" locale is critical for sorting and joining correctly
 | 
| 
 | 
    43 parse_commandline_options();
 | 
| 
 | 
    44 my (undef, $tmp_filename1) = tempfile(OPEN=>0);
 | 
| 
 | 
    45 my (undef, $tmp_filename2) = tempfile(OPEN=>0);
 | 
| 
 | 
    46 sort_file($input_filename1, $tmp_filename1, $FILE1_KEY_COLUMN);
 | 
| 
 | 
    47 sort_file($input_filename2, $tmp_filename2, $FILE2_KEY_COLUMN);
 | 
| 
 | 
    48 my $join_exit_code = join_files($tmp_filename1, $tmp_filename2);
 | 
| 
 | 
    49 cleanup_files($tmp_filename1, $tmp_filename2);
 | 
| 
 | 
    50 exit($join_exit_code);
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 ##
 | 
| 
 | 
    53 ## Program end
 | 
| 
 | 
    54 ##
 | 
| 
 | 
    55 
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 sub show_help()
 | 
| 
 | 
    58 {
 | 
| 
 | 
    59 print<<EOF;
 | 
| 
 | 
    60 ${PROGRAM}: Wrapper for GNU join+sort, automaticalyl sorts files before joining them.
 | 
| 
 | 
    61 
 | 
| 
 | 
    62 Usage: $PROGRAM [OPTIONS] [JOIN-OPTIONS] [SORT-OPTIONS] FILE1 FILE2
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 OPTIONS: Options specific to this program:
 | 
| 
 | 
    65 
 | 
| 
 | 
    66    --header      =  Both input files have a header line as the first line.
 | 
| 
 | 
    67                     The header line will be joined properly, without being sorted.
 | 
| 
 | 
    68 
 | 
| 
 | 
    69    --version     =  Print ${PROGRAM}'s version.
 | 
| 
 | 
    70 
 | 
| 
 | 
    71    --debug       =  Print debug messages (relating to ${PROGRAM}'s operation).
 | 
| 
 | 
    72 
 | 
| 
 | 
    73    --help        =  Show this help screen.
 | 
| 
 | 
    74 
 | 
| 
 | 
    75    --example     =  Show usage examples.
 | 
| 
 | 
    76 
 | 
| 
 | 
    77    --all         =  Short-cut for:
 | 
| 
 | 
    78                       -a 1 -a 2 -o auto -e . -t <TAB>
 | 
| 
 | 
    79                     This will show all values (paired and unpared) from both files,
 | 
| 
 | 
    80 		    Automatically formatting the columns, and using TAB as field separator.
 | 
| 
 | 
    81 		    You can override the empty filler (-e X) on the command line.
 | 
| 
 | 
    82 
 | 
| 
 | 
    83    --allh        =  Short-cut for:
 | 
| 
 | 
    84                        -a 1 -a 2 -o auto -e . -t <TAB> --header
 | 
| 
 | 
    85 		    Same as above, but will also respect the header line from both input files.
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 JOIN-OPTIONS:
 | 
| 
 | 
    88    All of GNU join options are supported.
 | 
| 
 | 
    89    Run:
 | 
| 
 | 
    90        join --help
 | 
| 
 | 
    91    To see all possible joining options.
 | 
| 
 | 
    92 
 | 
| 
 | 
    93 SORT-OPTIONS:
 | 
| 
 | 
    94    The following options are supported for the intermediate sorting step:
 | 
| 
 | 
    95 
 | 
| 
 | 
    96    -S SIZE
 | 
| 
 | 
    97    --buffer-size SIZE   = GNU sort's --buffer-size option.
 | 
| 
 | 
    98 
 | 
| 
 | 
    99    -T DIR
 | 
| 
 | 
   100    --temporary-directory DIR = GNU sort's --temporary-directory option.
 | 
| 
 | 
   101 
 | 
| 
 | 
   102    Run:
 | 
| 
 | 
   103       sort --help
 | 
| 
 | 
   104    To learn about these options. They might improve sorting performances for big files.
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 FILE1 FILE2:
 | 
| 
 | 
   107    The two input files to be sorted, joined.
 | 
| 
 | 
   108    Unlike GNU join,  joining STDIN is not supported. Both files must be real files.
 | 
| 
 | 
   109 
 | 
| 
 | 
   110 
 | 
| 
 | 
   111 NOTE About "--header" and "--auto-format":
 | 
| 
 | 
   112    The "--header" feature requires GNU coreutils version 8.6 or later.
 | 
| 
 | 
   113    The "-o auto" feature requires GNU coreutils version 8.10 or later.
 | 
| 
 | 
   114 
 | 
| 
 | 
   115 EOF
 | 
| 
 | 
   116 	exit(0);
 | 
| 
 | 
   117 }
 | 
| 
 | 
   118 
 | 
| 
 | 
   119 sub show_version()
 | 
| 
 | 
   120 {
 | 
| 
 | 
   121 print<<EOF;
 | 
| 
 | 
   122 $PROGRAM $VERSION
 | 
| 
 | 
   123 Copyright (C) 2010 A. Gordon (gordon\@cshl.edu)
 | 
| 
 | 
   124 License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 To see the GNU's join version, run:
 | 
| 
 | 
   127 	join --version
 | 
| 
 | 
   128 EOF
 | 
| 
 | 
   129 	exit(0);
 | 
| 
 | 
   130 }
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 sub show_examples()
 | 
| 
 | 
   133 {
 | 
| 
 | 
   134 print<<EOF;
 | 
| 
 | 
   135 Example of joining two unsorted files (each file having a header line):
 | 
| 
 | 
   136 
 | 
| 
 | 
   137 \$ cat input1.txt
 | 
| 
 | 
   138 Fruit	Color
 | 
| 
 | 
   139 Apple	red
 | 
| 
 | 
   140 Banana	yellow
 | 
| 
 | 
   141 Orange	orange
 | 
| 
 | 
   142 Melon	green
 | 
| 
 | 
   143 
 | 
| 
 | 
   144 \$ cat input2.txt
 | 
| 
 | 
   145 Fruit	Price
 | 
| 
 | 
   146 Orange	7
 | 
| 
 | 
   147 Avocado	8
 | 
| 
 | 
   148 Apple	4
 | 
| 
 | 
   149 Banana	3
 | 
| 
 | 
   150 
 | 
| 
 | 
   151 \$ easyjoin -j 1 -a 1 -a 2 --header -e . -o auto input1.txt input2.txt
 | 
| 
 | 
   152 Fruit   Color   Price
 | 
| 
 | 
   153 Apple   red     4
 | 
| 
 | 
   154 Avocado .       8
 | 
| 
 | 
   155 Banana  yellow  3
 | 
| 
 | 
   156 Melon   green   .
 | 
| 
 | 
   157 Orange  orange  7
 | 
| 
 | 
   158 
 | 
| 
 | 
   159 ## A short-cut for all the options above:
 | 
| 
 | 
   160 \$ easyjoin --allh input1.txt input2.txt
 | 
| 
 | 
   161 Fruit   Color   Price
 | 
| 
 | 
   162 Apple   red     4
 | 
| 
 | 
   163 Avocado .       8
 | 
| 
 | 
   164 Banana  yellow  3
 | 
| 
 | 
   165 Melon   green   .
 | 
| 
 | 
   166 Orange  orange  7
 | 
| 
 | 
   167 
 | 
| 
 | 
   168 EOF
 | 
| 
 | 
   169 	exit(0);
 | 
| 
 | 
   170 }
 | 
| 
 | 
   171 
 | 
| 
 | 
   172 sub parse_commandline_options()
 | 
| 
 | 
   173 {
 | 
| 
 | 
   174 	##
 | 
| 
 | 
   175 	## Parse command line
 | 
| 
 | 
   176 	##
 | 
| 
 | 
   177 	my $rc = GetOptions(
 | 
| 
 | 
   178 			"a=i" => sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] },
 | 
| 
 | 
   179 			"e=s" => \$EMPTY_FILLER,
 | 
| 
 | 
   180 			"ignore-case|i" => \$IGNORE_CASE,
 | 
| 
 | 
   181 			"j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; },
 | 
| 
 | 
   182 			"o=s" => \$OUTPUT_FORMAT,
 | 
| 
 | 
   183 			"t=s" => \$FIELD_SEP,
 | 
| 
 | 
   184 			"v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] },
 | 
| 
 | 
   185 			"1=i" => \$FILE1_KEY_COLUMN,
 | 
| 
 | 
   186 			"2=i" => \$FILE2_KEY_COLUMN,
 | 
| 
 | 
   187 			"debug" => \$debug,
 | 
| 
 | 
   188 			"header" => \$HEADER,
 | 
| 
 | 
   189 			"help" => \&show_help,
 | 
| 
 | 
   190 			"version" => \&show_version,
 | 
| 
 | 
   191 			"examples" => \&show_examples,
 | 
| 
 | 
   192 			"buffer-size|S=s" => \$SORT_BUFFER_SIZE,
 | 
| 
 | 
   193 			"temporary-directory|T=s" => \$SORT_TEMP_DIR,
 | 
| 
 | 
   194 			"all" => sub {
 | 
| 
 | 
   195 					push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
 | 
| 
 | 
   196 					$FIELD_SEP = "\t";
 | 
| 
 | 
   197 					$OUTPUT_FORMAT = "auto";
 | 
| 
 | 
   198 					$EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
 | 
| 
 | 
   199 				},
 | 
| 
 | 
   200 			"allh" => sub {
 | 
| 
 | 
   201 					push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
 | 
| 
 | 
   202 					$FIELD_SEP = "\t";
 | 
| 
 | 
   203 					$OUTPUT_FORMAT = "auto";
 | 
| 
 | 
   204 					$HEADER=1;
 | 
| 
 | 
   205 					$EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
 | 
| 
 | 
   206 				},
 | 
| 
 | 
   207 		);
 | 
| 
 | 
   208 	die "$PROGRAM: invalid command-line arguments.\n" unless $rc;
 | 
| 
 | 
   209 
 | 
| 
 | 
   210 	## We need two file names to join
 | 
| 
 | 
   211 	my @INPUT_FILES = @ARGV;
 | 
| 
 | 
   212 	die "$PROGRAM: missing operand: two file names to join\n" if (scalar(@INPUT_FILES)<2);
 | 
| 
 | 
   213 	die "$PROGRAM: error: too many files specified (can only join two files)\n" if (scalar(@INPUT_FILES)>2);
 | 
| 
 | 
   214 	die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n" if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-";
 | 
| 
 | 
   215 	die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!" unless -e $INPUT_FILES[0];
 | 
| 
 | 
   216 	die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!" unless -e $INPUT_FILES[1];
 | 
| 
 | 
   217 
 | 
| 
 | 
   218 	$input_filename1 = $INPUT_FILES[0];
 | 
| 
 | 
   219 	$input_filename2 = $INPUT_FILES[1];
 | 
| 
 | 
   220 }
 | 
| 
 | 
   221 
 | 
| 
 | 
   222 sub sort_file($$$)
 | 
| 
 | 
   223 {
 | 
| 
 | 
   224 	my ($input_filename, $output_filename, $key_column) = @_;
 | 
| 
 | 
   225 
 | 
| 
 | 
   226 	my @SORT_COMMAND;
 | 
| 
 | 
   227 	push @SORT_COMMAND, $HEADER ? "./sort-header" : "sort" ;
 | 
| 
 | 
   228 	push @SORT_COMMAND, "-f" if $IGNORE_CASE;
 | 
| 
 | 
   229 	push @SORT_COMMAND, "-k${key_column},${key_column}" ;
 | 
| 
 | 
   230 	push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if $SORT_BUFFER_SIZE;
 | 
| 
 | 
   231 	push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if $SORT_TEMP_DIR;
 | 
| 
 | 
   232 	push @SORT_COMMAND, "--output", $output_filename;
 | 
| 
 | 
   233 	push @SORT_COMMAND, "--debugheader" if $debug && $HEADER;
 | 
| 
 | 
   234 	push @SORT_COMMAND, "-t", $FIELD_SEP if $FIELD_SEP;
 | 
| 
 | 
   235 	push @SORT_COMMAND, $input_filename;
 | 
| 
 | 
   236 
 | 
| 
 | 
   237 	if ($debug) {
 | 
| 
 | 
   238 		warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n";
 | 
| 
 | 
   239 		warn "$PROGRAM: Sort command line:\n";
 | 
| 
 | 
   240 		print STDERR Dumper(\@SORT_COMMAND), "\n";
 | 
| 
 | 
   241 	}
 | 
| 
 | 
   242 
 | 
| 
 | 
   243 	my $sort_exit_code=1;
 | 
| 
 | 
   244 	system(@SORT_COMMAND);
 | 
| 
 | 
   245 	if ($? == -1) {
 | 
| 
 | 
   246 		die "$PROGRAM: Error: failed to execute 'sort': $!\n";
 | 
| 
 | 
   247 	}
 | 
| 
 | 
   248 	elsif ($? & 127) {
 | 
| 
 | 
   249 		my $signal = ($? & 127);
 | 
| 
 | 
   250 		kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
 | 
| 
 | 
   251 		die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n";
 | 
| 
 | 
   252 	}
 | 
| 
 | 
   253 	else {
 | 
| 
 | 
   254 		$sort_exit_code = ($? >> 8);
 | 
| 
 | 
   255 	}
 | 
| 
 | 
   256 	die "$PROGRAM: Error: 'sort' process failed, exit code $sort_exit_code\n" if $sort_exit_code!=0;
 | 
| 
 | 
   257 }
 | 
| 
 | 
   258 
 | 
| 
 | 
   259 sub join_files($$)
 | 
| 
 | 
   260 {
 | 
| 
 | 
   261 	my ($file1, $file2) = @_;
 | 
| 
 | 
   262 
 | 
| 
 | 
   263 	my @join_command = qw/join/;
 | 
| 
 | 
   264 	push @join_command, "--header" if $HEADER;
 | 
| 
 | 
   265 	push @join_command, "--ignore-case" if $IGNORE_CASE;
 | 
| 
 | 
   266 	push @join_command, "-t", $FIELD_SEP if $FIELD_SEP;
 | 
| 
 | 
   267 	push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN;
 | 
| 
 | 
   268 	push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN;
 | 
| 
 | 
   269 	push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER;
 | 
| 
 | 
   270 	push @join_command, "-o", $OUTPUT_FORMAT if $OUTPUT_FORMAT;
 | 
| 
 | 
   271 	push @join_command, @OUTPUT_SPECIFIERS;
 | 
| 
 | 
   272 	push @join_command, $file1, $file2;
 | 
| 
 | 
   273 
 | 
| 
 | 
   274 	if ($debug) {
 | 
| 
 | 
   275 		warn "$PROGRAM: Running join on '$file1'  and '$file2'\n";
 | 
| 
 | 
   276 		warn "$PROGRAM: join command line:\n";
 | 
| 
 | 
   277 		print STDERR Dumper(\@join_command), "\n";
 | 
| 
 | 
   278 	}
 | 
| 
 | 
   279 
 | 
| 
 | 
   280 	my $join_exit_code=1;
 | 
| 
 | 
   281 	system(@join_command);
 | 
| 
 | 
   282 	if ($? == -1) {
 | 
| 
 | 
   283 		die "$PROGRAM: Error: failed to execute 'join': $!\n";
 | 
| 
 | 
   284 	}
 | 
| 
 | 
   285 	elsif ($? & 127) {
 | 
| 
 | 
   286 		my $signal = ($? & 127);
 | 
| 
 | 
   287 		kill 2, $$ if $signal == 2; ##if join was interrupted (CTRL-C) - just pass it on and commit suicide
 | 
| 
 | 
   288 		die "$PROGRAM: Error: 'join' child-process died with signal $signal\n";
 | 
| 
 | 
   289 	}
 | 
| 
 | 
   290 	else {
 | 
| 
 | 
   291 		$join_exit_code = ($? >> 8);
 | 
| 
 | 
   292 	}
 | 
| 
 | 
   293 	return $join_exit_code;
 | 
| 
 | 
   294 }
 | 
| 
 | 
   295 
 | 
| 
 | 
   296 sub cleanup_files(@)
 | 
| 
 | 
   297 {
 | 
| 
 | 
   298 	my (@files) = @_;
 | 
| 
 | 
   299 
 | 
| 
 | 
   300 	foreach my $file (@files) {
 | 
| 
 | 
   301 		if ($debug) {
 | 
| 
 | 
   302 			warn "$PROGRAM: debug mode, not deleting temporary file '$file'\n";
 | 
| 
 | 
   303 		} else {
 | 
| 
 | 
   304 			my $count = unlink $file;
 | 
| 
 | 
   305 			warn "$PROGRAM: Error: failed to delete temporary file '$file': $!\n" if ($count != 1);
 | 
| 
 | 
   306 		}
 | 
| 
 | 
   307 	}
 | 
| 
 | 
   308 }
 |