Mercurial > repos > bgruening > text_processing
annotate find_and_replace @ 24:c41d78ae5fee draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit 4dd118c84ed4d6157303e71438c24446ec4b4f31
| author | bgruening |
|---|---|
| date | Wed, 04 Jun 2025 15:12:29 +0000 |
| parents | fb4ff3c42cd3 |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env perl |
| 2 use strict; | |
| 3 use warnings; | |
| 4 use Getopt::Std; | |
| 5 | |
| 6 sub parse_command_line(); | |
| 7 sub build_regex_string(); | |
| 8 sub usage(); | |
| 9 | |
| 10 my $input_file ; | |
| 11 my $output_file; | |
| 12 my $find_pattern ; | |
| 13 my $replace_pattern ; | |
| 14 my $find_complete_words ; | |
| 15 my $find_pattern_is_regex ; | |
| 16 my $find_in_specific_column ; | |
| 17 my $find_case_insensitive ; | |
| 18 my $replace_global ; | |
| 19 my $skip_first_line ; | |
| 20 | |
| 21 | |
| 22 ## | |
| 23 ## Program Start | |
| 24 ## | |
| 25 usage() if @ARGV<2; | |
| 26 parse_command_line(); | |
| 27 my $regex_string = build_regex_string() ; | |
| 28 | |
| 29 # Allow first line to pass without filtering? | |
| 30 if ( $skip_first_line ) { | |
| 31 my $line = <$input_file>; | |
| 32 print $output_file $line ; | |
| 33 } | |
| 34 | |
| 35 | |
| 36 ## | |
| 37 ## Main loop | |
| 38 ## | |
| 39 | |
| 40 ## I LOVE PERL (and hate it, at the same time...) | |
| 41 ## | |
| 42 ## So what's going on with the self-compiling perl code? | |
| 43 ## | |
| 44 ## 1. The program gets the find-pattern and the replace-pattern from the user (as strings). | |
| 45 ## 2. If both the find-pattern and replace-pattern are simple strings (not regex), | |
| 46 ## it would be possible to pre-compile a regex (with qr//) and use it in a 's///' | |
| 47 ## 3. If the find-pattern is a regex but the replace-pattern is a simple text string (with out back-references) | |
| 48 ## it is still possible to pre-compile the regex and use it in a 's///' | |
| 49 ## However, | |
| 50 ## 4. If the replace-pattern contains back-references, pre-compiling is not possible. | |
| 51 ## (in perl, you can't precompile a substitute regex). | |
| 52 ## See these examples: | |
| 53 ## http://www.perlmonks.org/?node_id=84420 | |
| 54 ## http://stackoverflow.com/questions/125171/passing-a-regex-substitution-as-a-variable-in-perl | |
| 55 ## | |
| 56 ## The solution: | |
| 57 ## we build the regex string as valid perl code (in 'build_regex()', stored in $regex_string ), | |
| 58 ## Then eval() a new perl code that contains the substitution regex as inlined code. | |
| 59 ## Gotta love perl! | |
| 60 | |
| 61 my $perl_program ; | |
| 62 if ( $find_in_specific_column ) { | |
| 63 # Find & replace in specific column | |
| 64 | |
| 65 $perl_program = <<EOF; | |
| 66 while ( <STDIN> ) { | |
| 67 chomp ; | |
|
14
fb4ff3c42cd3
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit 09b22cceacb34dd4c6c1b42890f93232df128208"
bgruening
parents:
0
diff
changeset
|
68 my \@columns = split /\t/; |
| 0 | 69 |
| 70 #not enough columns in this line - skip it | |
| 71 next if ( \@columns < $find_in_specific_column ) ; | |
| 72 | |
| 73 \$columns [ $find_in_specific_column - 1 ] =~ $regex_string ; | |
| 74 | |
| 75 print STDOUT join("\t", \@columns), "\n" ; | |
| 76 } | |
| 77 EOF | |
| 78 | |
| 79 } else { | |
| 80 # Find & replace the entire line | |
| 81 $perl_program = <<EOF; | |
| 82 while ( <STDIN> ) { | |
| 83 $regex_string ; | |
| 84 print STDOUT; | |
| 85 } | |
| 86 EOF | |
| 87 } | |
| 88 | |
| 89 | |
| 90 # The dynamic perl code reads from STDIN and writes to STDOUT, | |
| 91 # so connect these handles (if the user didn't specifiy input / output | |
| 92 # file names, these might be already be STDIN/OUT, so the whole could be a no-op). | |
| 93 *STDIN = $input_file ; | |
| 94 *STDOUT = $output_file ; | |
| 95 eval $perl_program ; | |
| 96 | |
| 97 | |
| 98 ## | |
| 99 ## Program end | |
| 100 ## | |
| 101 | |
| 102 | |
| 103 sub parse_command_line() | |
| 104 { | |
| 105 my %opts ; | |
| 106 getopts('grsiwc:o:', \%opts) or die "$0: Invalid option specified\n"; | |
| 107 | |
| 108 die "$0: missing Find-Pattern argument\n" if (@ARGV==0); | |
| 109 $find_pattern = $ARGV[0]; | |
| 110 die "$0: missing Replace-Pattern argument\n" if (@ARGV==1); | |
| 111 $replace_pattern = $ARGV[1]; | |
| 112 | |
| 113 $find_complete_words = ( exists $opts{w} ) ; | |
| 114 $find_case_insensitive = ( exists $opts{i} ) ; | |
| 115 $skip_first_line = ( exists $opts{s} ) ; | |
| 116 $find_pattern_is_regex = ( exists $opts{r} ) ; | |
| 117 $replace_global = ( exists $opts{g} ) ; | |
| 118 | |
| 119 # Search in specific column ? | |
| 120 if ( defined $opts{c} ) { | |
| 121 $find_in_specific_column = $opts{c}; | |
| 122 | |
| 123 die "$0: invalid column number ($find_in_specific_column).\n" | |
| 124 unless $find_in_specific_column =~ /^\d+$/ ; | |
| 125 | |
| 126 die "$0: invalid column number ($find_in_specific_column).\n" | |
| 127 if $find_in_specific_column <= 0; | |
| 128 } | |
| 129 else { | |
| 130 $find_in_specific_column = 0 ; | |
| 131 } | |
| 132 | |
| 133 # Output File specified (instead of STDOUT) ? | |
| 134 if ( defined $opts{o} ) { | |
| 135 my $filename = $opts{o}; | |
| 136 open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ; | |
| 137 } else { | |
| 138 $output_file = *STDOUT ; | |
| 139 } | |
| 140 | |
| 141 | |
| 142 # Input file Specified (instead of STDIN) ? | |
| 143 if ( @ARGV>2 ) { | |
| 144 my $filename = $ARGV[2]; | |
| 145 open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ; | |
| 146 } else { | |
| 147 $input_file = *STDIN; | |
| 148 } | |
| 149 } | |
| 150 | |
| 151 sub build_regex_string() | |
| 152 { | |
| 153 my $find_string ; | |
| 154 my $replace_string ; | |
| 155 | |
| 156 if ( $find_pattern_is_regex ) { | |
| 157 $find_string = $find_pattern ; | |
| 158 $replace_string = $replace_pattern ; | |
| 159 } else { | |
| 160 $find_string = quotemeta $find_pattern ; | |
| 161 $replace_string = quotemeta $replace_pattern; | |
| 162 } | |
| 163 | |
| 164 if ( $find_complete_words ) { | |
| 165 $find_string = "\\b($find_string)\\b"; | |
| 166 } | |
| 167 | |
| 168 my $regex_string = "s/$find_string/$replace_string/"; | |
| 169 | |
| 170 $regex_string .= "i" if ( $find_case_insensitive ); | |
| 171 $regex_string .= "g" if ( $replace_global ) ; | |
| 172 | |
| 173 | |
| 174 return $regex_string; | |
| 175 } | |
| 176 | |
| 177 sub usage() | |
| 178 { | |
| 179 print <<EOF; | |
| 180 | |
| 181 Find and Replace | |
| 182 Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu ) | |
| 183 | |
| 184 Usage: $0 [-o OUTPUT] [-g] [-r] [-w] [-i] [-c N] [-l] FIND-PATTERN REPLACE-PATTERN [INPUT-FILE] | |
| 185 | |
| 186 -g - Global replace - replace all occurences in line/column. | |
| 187 Default - replace just the first instance. | |
| 188 -w - search for complete words (not partial sub-strings). | |
| 189 -i - case insensitive search. | |
| 190 -c N - check only column N, instead of entire line (line split by whitespace). | |
| 191 -l - skip first line (don't replace anything in it) | |
| 192 -r - FIND-PATTERN and REPLACE-PATTERN are perl regular expression, | |
| 193 usable inside a 's///' statement. | |
| 194 By default, they are used as verbatim text strings. | |
| 195 -o OUT - specify output file (default = STDOUT). | |
| 196 INPUT-FILE - (optional) read from file (default = from STDIN). | |
| 197 | |
| 198 | |
| 199 EOF | |
| 200 | |
| 201 exit; | |
| 202 } |
