Mercurial > repos > bgruening > text_processing
annotate multijoin @ 17:f46f0e4f75c4 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit 5f5d5802a961a77ceb092cbdef90d93e29717029-dirty"
| author | bgruening |
|---|---|
| date | Tue, 22 Jun 2021 16:06:48 +0000 |
| parents | 20344ce0c811 |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env perl |
| 2 use strict; | |
| 3 use warnings; | |
| 4 use Getopt::Long qw(:config no_ignore_case); | |
| 5 use Data::Dumper; | |
| 6 use Carp; | |
| 7 use File::Basename; | |
| 8 | |
## Program version, shown by --help.
my $version = '0.1.1';

## Settings filled in by parse_command_line_parameters().
my $field_sep = "\t";       # input/output field separator (-t)
my $key_column;             # 1-based key column (--key)
my @values_columns;         # 1-based value columns (--values)
my $max_value_column;       # largest entry of @values_columns
my @input_files;            # input file names, in command-line order
my $input_headers;          # true if inputs start with a header line
my $output_headers;         # true if a header line should be printed
my $filler = '0';           # placeholder for a missing value (--filler)
my $filler_string;          # $filler repeated once per value column
my $ignore_duplicates;      # true to accept repeated keys within a file
my $debug = 0;
my %input_headers;          # file name => { value column => header text }
my $have_file_labels;       # true if arguments are file/label pairs
my %file_labels;            # file name => label used in the output header

## Forward declarations (the subs are defined after the program body).
sub parse_command_line_parameters();
sub show_help();
sub read_input_file($);
sub print_combined_data();
sub sanitize_filename($);
sub print_output_header();
sub show_examples();

##
## Program Start
##

parse_command_line_parameters();

## key => { file name => joined value columns }, filled by read_input_file().
my %data;
for my $path (@input_files) {
    read_input_file($path);
}
#print STDERR Dumper(\%input_headers),"\n";
#print STDERR Dumper(\%data) if $debug;
if ($output_headers) {
    print_output_header();
}
print_combined_data();
| 47 | |
| 48 | |
| 49 ## | |
| 50 ## Program End | |
| 51 ## | |
## Print the header line of the joined table.
##
## The first cell is the literal "key". It is followed by one cell per
## (input file, value column) pair, named "<file label>_<column name>".
## The column name is the header text recorded for that file and column in
## %input_headers when available, and "V<column number>" otherwise.
## Dies if the line cannot be written.
sub print_output_header()
{
    my @cells = ("key");

    for my $path (@input_files) {
        my $prefix = $file_labels{$path};

        for my $col_no (@values_columns) {
            my $name = "V$col_no";
            $name = $input_headers{$path}->{$col_no}
                if exists $input_headers{$path}->{$col_no};

            push @cells, $prefix . "_" . $name;
        }
    }

    print join($field_sep, @cells), "\n"
        or die "Output error: can't write output line: $!\n";
}
| 67 | |
## Print one output line per key, with keys in sorted (string) order.
##
## Each line holds the key followed by the value string stored for every
## input file, in command-line order. A file that did not contain the key
## contributes $filler_string instead. Dies if a line cannot be written.
sub print_combined_data()
{
    for my $key (sort keys %data) {
        my $per_file = $data{$key};

        my @cells = map {
            (exists $per_file->{$_}) ? $per_file->{$_} : $filler_string
        } @input_files;

        print join($field_sep, $key, @cells), "\n"
            or die "Output error: can't write output line: $!\n";
    }
}
| 84 | |
## Derive a header-safe identifier from a file name.
##
## Takes the base name of the path, removes the last ".ext" suffix, strips
## leading/trailing characters other than word characters, '.' and '-', and
## replaces every remaining run of such characters with a single '_'.
##
## Croaks with "missing file name" when the argument is undefined or empty.
sub sanitize_filename($)
{
    my $filename = shift;

    # The former 'my ($filename) = shift or croak ...' could never croak:
    # a list assignment in scalar context yields the number of right-hand
    # elements (always 1 here), so the check is spelled out explicitly.
    # Testing definedness/length also keeps a file literally named "0" valid.
    croak "missing file name"
        unless defined $filename && length $filename;

    my $file_ID = basename($filename);
    $file_ID =~ s/\.\w+$//;          # remove extension
    $file_ID =~ s/^[^\w\.\-]+//;     # trim bad characters at the start
    $file_ID =~ s/[^\w\.\-]+$//;     # trim bad characters at the end
    $file_ID =~ s/[^\w\.\-]+/_/g;    # sanitize bad characters
    return $file_ID;
}
| 95 | |
## Read one input file into %data (and its header into %input_headers).
##
## For every data line, the field at $key_column becomes the key and the
## fields listed in @values_columns are joined with $field_sep and stored in
## $data{key}->{file name}. When $input_headers is set, the first line is
## consumed as a header and the names of the value columns are recorded in
## $input_headers{file name}.
##
## Dies when the file cannot be opened or closed, when a line has fewer
## columns than required, or when a key repeats within the file and
## $ignore_duplicates is not set. Croaks when no file name is given.
sub read_input_file($)
{
    my $filename = shift;

    # 'my ($filename) = shift or croak ...' never croaked, because a list
    # assignment in scalar context returns the right-hand element count (1).
    croak "Missing input file name"
        unless defined $filename && length $filename;

    my @value_indexes = map { $_ - 1 } @values_columns; #zero-based indexes for value columns

    # split() takes a pattern, so the separator must be quoted to be matched
    # literally (otherwise '|' or '.' split in the wrong places). A single
    # space is passed through untouched to keep split's whitespace mode.
    my $separator = ($field_sep eq ' ') ? ' ' : quotemeta($field_sep);

    open my $fh, "<", $filename
        or die "Error: can't open file '$filename': $!\n";

    ## Read file's header
    if ($input_headers) {
        my $line = <$fh>;
        die "Input error: file '$filename' is empty (expected a header line)\n"
            unless defined $line;
        chomp $line;

        # Limit -1 keeps trailing empty fields so they still count as columns.
        my @fields = split $separator, $line, -1;

        my $num_input_fields = scalar(@fields);
        die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;

        foreach my $col (@values_columns) {
            $input_headers{$filename}->{$col} = $fields[$col-1] ;
        }
    }

    ## Read file's data
    while ( my $line = <$fh> ) {
        chomp $line;
        my @fields = split $separator, $line, -1;

        my $num_input_fields = scalar(@fields);
        die "Input error: file '$filename' line $. doesn't have enough columns (key column = $key_column, line has only $num_input_fields columns)\n" if $num_input_fields < $key_column ;
        die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;

        my $key = $fields[$key_column-1];
        my $value = join($field_sep, @fields[@value_indexes]);

        die "Input error: file '$filename' line $. have duplicated key '$key'.\n"
            if (exists $data{$key}->{$filename} && !$ignore_duplicates) ;

        # With --ignore-dups the last occurrence of a key wins.
        $data{$key}->{$filename} = $value;
    }

    close $fh
        or die "Error: can't close file '$filename': $!\n";
}
| 140 | |
## Parse the command line into the file-level option variables.
##
## Sets $key_column, @values_columns, $max_value_column, $field_sep,
## $input_headers, $output_headers, $ignore_duplicates, $filler,
## $filler_string, $have_file_labels, @input_files and %file_labels.
## Whatever remains in @ARGV after the options is the list of input files,
## or, with --labels, alternating file names and labels.
##
## Dies with an "Error: ..." message on invalid or missing parameters.
## --help and --examples print their text and exit.
sub parse_command_line_parameters()
{
    my $values_columns_string;

    my $rc = GetOptions(
        "help"         => \&show_help,
        "key|k=i"      => \$key_column,
        "values|v=s"   => \$values_columns_string,
        "t=s"          => \$field_sep,
        "in-header"    => \$input_headers,
        "out-header|h" => \$output_headers,
        # The help text documents '--headers'; register it next to '-H'.
        "headers|H"    => sub { $input_headers = 1 ; $output_headers = 1 ; },
        "ignore-dups"  => \$ignore_duplicates,
        "filler|f=s"   => \$filler,
        "examples"     => \&show_examples,
        "labels"       => \$have_file_labels,
    );
    die "Error: invalid command-line parameters.\n" unless $rc;

    die "Error: missing key column. use --key N. see --help for more details.\n" unless defined $key_column;
    die "Error: Invalid key column ($key_column). Must be bigger than zero. see --help for more details.\n" if $key_column <= 0 ;

    die "Error: missing values column. use --values V1,V2,Vn. See --help for more details.\n" unless defined $values_columns_string;

    # Ignore whitespace around the whole list, as is already done around commas.
    $values_columns_string =~ s/^\s+//;
    $values_columns_string =~ s/\s+$//;
    @values_columns = split(/\s*,\s*/, $values_columns_string);

    die "Error: missing values column. use --values N,N,N. see --help for more details.\n" unless scalar(@values_columns)>0;
    foreach my $v (@values_columns) {
        die "Error: invalid value column ($v), please use only numbers>=1. see --help for more details.\n"
            unless $v =~ /^\d+$/ && $v>=1;

        # Track the largest requested column; used to validate input lines.
        $max_value_column = $v unless defined $max_value_column && $max_value_column>$v;
    }

    # One filler per value column, so a missing file keeps the table aligned.
    $filler_string = join($field_sep, map { $filler } @values_columns);

    if ($have_file_labels) {
        ## have file labels - each pair of parameters is a file/label pair.
        die "Error: missing input files and labels\n" if scalar(@ARGV)==0;
        die "Error: when using --labels, a pair of file names + labels is required (got odd number of arguments)\n" unless scalar(@ARGV)%2==0;

        while (@ARGV) {
            my $filename = shift @ARGV;
            my $label = shift @ARGV;

            # Same character policy as sanitize_filename(), minus the
            # basename/extension handling.
            $label =~ s/^[^\.\w\-]+//;
            $label =~ s/[^\.\w\-]+$//g;
            $label =~ s/[^\.\w\-]+/_/g;

            $file_labels{$filename} = $label;
            push @input_files, $filename;
        }
    } else {
        ## no file labels - the rest of the arguments are just file names;
        @input_files = @ARGV;
        die "Error: missing input files\n" if scalar(@input_files)==0;
        die "Error: need more than one input file to join.\n" if scalar(@input_files)==1;

        foreach my $file (@input_files) {
            $file_labels{$file} = sanitize_filename($file);
        }
    }
}
| 205 | |
## Print the usage/help screen to standard output and exit with status 0.
## The text interpolates $version and the current $filler value.
sub show_help()
{
    my $help_text = <<"EOF";
Multi-File join, version $version
Copyright (C) 2012 - A. Gordon (gordon at cshl dot edu)
License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)

Usage:
multijoin [OPTIONS] -k N -v V1,V2,Vn,.. FILE1 FILE2 ... FILEn

Options:

--help This helpful help screen.

-k N
--key N Use column N as key column.

-v V1,V2,Vn
--values V1,V2,Vn
Use columns V1,V2,Vn as value columns - those will be joined
According to the Key column.
Multiple columns can be specified.

-t SEP Use SEP as field separator character (default: tab).

-h
--out-header Add a header line to the output file.

--in-header The input files have a header line.
The first line will not be joined.
if '--out-header' is also used, the output column headers will
be constructed based on the input header column names.

-H
--headers Same as '--in-header --out-header' combined.

--ignore-dups Ignore duplicated keys (within a file).
By default, duplicated keys cause an error.

-f X
--filler X Fill missing values with X.
(Default: '$filler').

--labels When printing output headers with '-h', instead of using the file name,
use specific labels.
Each file name must be followed by a name.

example (without labels):
\$ multijoin -h -k 1 -v 2 A.TXT B.TXT C.TXT

example (with labels):
\$ multijoin -h --labels -k 1 -v 2 A.TXT Sample1 B.TXT SampleB C.TXT SampleC

--examples Show detailed examples.

EOF

    print $help_text;
    exit(0);
}
| 264 | |
## Print a worked example (three sample inputs and the joined result) to
## standard output and exit with status 0.
##
## The header row of the sample result uses "<label>_V<column>" names, which
## is the form print_output_header() builds (label, "_", "V<column>") when
## the inputs have no header line.
sub show_examples()
{
    print<<EOF;

To join three files, based on the 4th column, and keeping the 7th,8th,9th columns:

\$ head *.txt
==> AAA.txt <==
chr4 888449 890171 FBtr0308778 0 + 266 1527 1722
chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850
chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831
chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831
chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831
chr4 995793 996435 FBtr0111046 0 + 7 166 642
chr4 995793 997931 FBtr0111044 0 + 28 683 2138
chr4 995793 997931 FBtr0111045 0 + 28 683 2138
chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690

==> BBB.txt <==
chr4 90286 134453 FBtr0309803 0 + 657 29084 44167
chr4 251355 266499 FBtr0089116 0 + 56 1296 15144
chr4 252050 266506 FBtr0308086 0 + 56 1296 14456
chr4 252050 266506 FBtr0308087 0 + 56 1296 14456
chr4 252053 266528 FBtr0300796 0 + 56 1296 14475
chr4 252053 266528 FBtr0300800 0 + 56 1296 14475
chr4 252055 266528 FBtr0300798 0 + 56 1296 14473
chr4 252055 266528 FBtr0300799 0 + 56 1296 14473
chr4 252541 266528 FBtr0300797 0 + 56 1296 13987

==> CCC.txt <==
chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850
chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831
chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831
chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831
chr4 995793 996435 FBtr0111046 0 + 5 304 642
chr4 995793 997931 FBtr0111044 0 + 17 714 2138
chr4 995793 997931 FBtr0111045 0 + 17 714 2138
chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690

\$ multijoin -h --key 4 --values 7,8,9 *.txt | head -n 10
key AAA_V7 AAA_V8 AAA_V9 BBB_V7 BBB_V8 BBB_V9 CCC_V7 CCC_V8 CCC_V9
FBtr0089116 0 0 0 56 1296 15144 0 0 0
FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690
FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831
FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831
FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831
FBtr0111044 28 683 2138 0 0 0 17 714 2138
FBtr0111045 28 683 2138 0 0 0 17 714 2138
FBtr0111046 7 166 642 0 0 0 5 304 642
FBtr0300796 0 0 0 56 1296 14475 0 0 0



EOF
    exit(0);
}
