Mercurial > repos > xuebing > sharplabtool
view tools/unix_tools/word_list_grep.pl @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl
#
# word_list_grep.pl - filter text lines against a list of words.
#
# Reads lines from INPUT-FILE (or STDIN) and prints those that contain at
# least one word from WORD-LIST-FILE (or, with -v, those that contain none).
#
# Usage:
#   word_list_grep.pl [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]
#
use strict;
use warnings;
use Getopt::Std;

# Run-time settings. All of them are filled in by parse_command_line().
my $word_list_file;             # path of the file holding one word per line
my $input_file;                 # handle the lines are read from
my $output_file;                # handle the selected lines are written to
my $find_complete_words;        # -w : match whole words only
my $find_inverse;               # -v : print the lines that do NOT match
my $find_in_specific_column;    # -c N : 1-based column to test, 0 = whole line
my $find_case_insensitive;      # -i : ignore case while matching
my $skip_first_line;            # -s : copy the first line through unfiltered

# Run the program only when executed as a script. When the file is loaded
# with require/do, only the subroutines are defined.
main() unless caller;

##
## Program entry point.
##
## Parses the command line, loads the word list, then copies every input
## line that passes the filter to the output handle.
##
sub main {
    usage() if @ARGV == 0;

    parse_command_line();
    my @words = load_word_list();
    my $regex = compile_regex(@words);

    # -s: let the first line pass without filtering.
    if ($skip_first_line) {
        my $header = <$input_file>;

        # An empty input has no first line; printing undef would only
        # produce an "uninitialized value" warning.
        print {$output_file} $header if defined $header;
    }

    LINE:
    while ( my $line = <$input_file> ) {
        my $target = $line;

        # When searching one column only, extract that column's content.
        if ($find_in_specific_column) {
            my @columns = split ' ', $line;

            # Not enough columns in this line - skip it.
            next LINE if @columns < $find_in_specific_column;

            $target = $columns[ $find_in_specific_column - 1 ];
        }

        # Print when exactly one of "matched" / "inverse requested" holds.
        print {$output_file} $line
            if ( ( $target =~ $regex ) xor $find_inverse );
    }

    close $input_file;

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close $output_file
        or die "$0: Failed to write output: $!\n";

    return;
}

##
## Reads the options and positional arguments from @ARGV and stores the
## result in the file-level settings above.
##
## Dies with a message when an option is invalid, the word-list argument is
## missing or does not exist, the column number is not a positive integer,
## or the input/output file cannot be opened.
##
sub parse_command_line {
    my %opts;
    getopts( 'siwvc:o:', \%opts )
        or die "$0: Invalid option specified\n";

    die "$0: missing word-list file name\n" if @ARGV == 0;

    $word_list_file = $ARGV[0];
    die "$0: Word-list file '$word_list_file' not found\n"
        unless -e $word_list_file;

    $find_complete_words   = exists $opts{w};
    $find_inverse          = exists $opts{v};
    $find_case_insensitive = exists $opts{i};
    $skip_first_line       = exists $opts{s};

    # Search in a specific column?
    if ( defined $opts{c} ) {
        $find_in_specific_column = $opts{c};

        # \A..\z with an explicit digit class: accept plain positive
        # integers only.
        die "$0: invalid column number ($find_in_specific_column).\n"
            unless $find_in_specific_column =~ /\A[0-9]+\z/
            && $find_in_specific_column > 0;
    }
    else {
        $find_in_specific_column = 0;
    }

    # Output file specified (instead of STDOUT)?
    # Three-argument open keeps characters in the name from being read as
    # an open mode. "-" is handled explicitly so it keeps meaning STDOUT.
    if ( defined $opts{o} && $opts{o} ne '-' ) {
        my $filename = $opts{o};
        open $output_file, '>', $filename
            or die "$0: Failed to create output file '$filename': $!\n";
    }
    else {
        $output_file = \*STDOUT;
    }

    # Input file specified (instead of STDIN)? "-" keeps meaning STDIN.
    if ( @ARGV > 1 && $ARGV[1] ne '-' ) {
        my $filename = $ARGV[1];
        open $input_file, '<', $filename
            or die "$0: Failed to open input file '$filename': $!\n";
    }
    else {
        $input_file = \*STDIN;
    }

    return;
}

##
## Loads the words from $word_list_file.
##
## Leading and trailing whitespace is stripped from every line, empty lines
## are ignored, and each remaining word is passed through quotemeta so it is
## matched literally.
##
## Returns the list of quoted words. Dies when the file cannot be opened or
## contains no words.
##
sub load_word_list {
    open my $wordlist, '<', $word_list_file
        or die "$0: Failed to open word-list file '$word_list_file': $!\n";

    my @words;
    while ( my $word = <$wordlist> ) {
        chomp $word;
        $word =~ s/^\s+//;
        $word =~ s/\s+$//;
        next if length($word) == 0;
        push @words, quotemeta $word;
    }
    close $wordlist;

    die "$0: Error: word-list file '$word_list_file' is empty!\n"
        unless @words;

    return @words;
}

##
## Builds one compiled regex that matches any of the given words.
##
## Arguments: the words, already quoted for use inside a regex.
## Honours $find_complete_words (wraps the alternation in \b...\b) and
## $find_case_insensitive (adds the /i flag).
##
## Returns the compiled regex (a qr// object).
##
sub compile_regex {
    my @words = @_;

    my $regex_string = join '|', @words;

    # Only grouping is needed here, so use a non-capturing group.
    if ($find_complete_words) {
        $regex_string = "\\b(?:$regex_string)\\b";
    }

    return $find_case_insensitive
        ? qr/$regex_string/i
        : qr/$regex_string/;
}

##
## Prints the help text to STDOUT and exits.
##
sub usage {
    print <<EOF;
Word-List Grep
Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )

Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]

   -s   - do not filter first line - always output the first line from the input file.
   -w   - search for complete words (not partial sub-strings).
   -i   - case insensitive search.
   -v   - inverse - output lines NOT matching the word list.
   -c N - check only column N, instead of entire line (line split by whitespace).
   -o OUT - specify output file (default = STDOUT).
   WORD-LIST-FILE - file containing one word per line. These will be used
          for the search.
   INPUT-FILE - (optional) read from file (default = from STDIN).

EOF

    exit;
}