comparison tools/unix_tools/word_list_grep.pl @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Std;
5
# Forward declarations (with prototypes) so the subs defined further down
# can be called from the main program before their definitions are seen.
sub parse_command_line();
sub load_word_list();
sub compile_regex(@);
sub usage();

# Settings shared between the main program and the subs below.
# All of them are filled in by parse_command_line().
my $word_list_file;             # path of the file holding the search words
my $input_file;                 # input handle (opened file, or STDIN)
my $output_file;                # output handle (opened file, or STDOUT)
my $find_complete_words;        # -w : match complete words only
my $find_inverse;               # -v : output lines that do NOT match
my $find_in_specific_column;    # -c N : 1-based column to test, 0 = whole line
my $find_case_insensitive;      # -i : ignore case
my $skip_first_line;            # -s : pass the first line through unfiltered


##
## Program Start
##
usage() if @ARGV == 0;
parse_command_line();

my @words = load_word_list();

my $regex = compile_regex(@words);

# Allow first line to pass without filtering?
if ($skip_first_line) {
    my $first_line = <$input_file>;

    # An empty input has no first line at all; printing undef would only
    # produce an "uninitialized value" warning.
    print {$output_file} $first_line if defined $first_line;
}


##
## Main loop
##
LINE:
while ( my $line = <$input_file> ) {
    my $target = $line;

    # If searching in a specific column (and not in the entire line)
    # extract the content of that one column.
    if ($find_in_specific_column) {
        my @columns = split ' ', $line;

        # Not enough columns in this line - skip it.
        next LINE if @columns < $find_in_specific_column;

        $target = $columns[ $find_in_specific_column - 1 ];
    }

    # Output the line when it matches, or (with -v) when it does not.
    # Logical xor is used so the test does not depend on how the two
    # boolean values happen to be represented.
    if ( ( $target =~ $regex ) xor $find_inverse ) {
        print {$output_file} $line;
    }
}

close $input_file;

# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close $output_file or die "$0: Failed to write output: $!\n";
64
65 ##
66 ## Program end
67 ##
68
69
# Parse the command line into the file-level settings variables and open
# the input/output handles.
#
# Options (see usage()): -s -i -w -v, -c N (column number), -o FILE (output).
# Positional arguments: WORD-LIST-FILE, then an optional INPUT-FILE.
#
# Takes no arguments and returns nothing useful; dies with a message on an
# invalid option, a missing or non-existent word-list file, an invalid
# column number, or a file that cannot be opened.
sub parse_command_line()
{
    my %opts;
    getopts('siwvc:o:', \%opts) or die "$0: Invalid option specified\n";

    die "$0: missing word-list file name\n" if (@ARGV==0);

    $word_list_file = $ARGV[0];
    die "$0: Word-list file '$word_list_file' not found\n" unless -e $word_list_file ;

    # Simple on/off switches.
    $find_complete_words   = ( exists $opts{w} );
    $find_inverse          = ( exists $opts{v} );
    $find_case_insensitive = ( exists $opts{i} );
    $skip_first_line       = ( exists $opts{s} );

    # Search in specific column ?
    if ( defined $opts{c} ) {
        $find_in_specific_column = $opts{c};

        # \A...\z (rather than ^...$) also rejects a value with a trailing newline.
        die "$0: invalid column number ($find_in_specific_column).\n"
            unless $find_in_specific_column =~ /\A\d+\z/ ;

        die "$0: invalid column number ($find_in_specific_column).\n"
            if $find_in_specific_column <= 0;
    }
    else {
        # 0 means "test the whole line".
        $find_in_specific_column = 0 ;
    }

    # Output File specified (instead of STDOUT) ?
    # Three-argument open keeps the mode separate from the file name, so
    # special characters or whitespace in the name cannot alter how it is opened.
    if ( defined $opts{o} ) {
        my $filename = $opts{o};
        open $output_file, '>', $filename
            or die "$0: Failed to create output file '$filename': $!\n" ;
    }
    else {
        $output_file = *STDOUT ;
    }

    # Input file Specified (instead of STDIN) ?
    if ( @ARGV>1 ) {
        my $filename = $ARGV[1];
        open $input_file, '<', $filename
            or die "$0: Failed to open input file '$filename': $!\n" ;
    }
    else {
        $input_file = *STDIN;
    }
}
119
# Read the word-list file named by $word_list_file.
#
# Each line is stripped of leading/trailing whitespace; blank lines are
# skipped. Every remaining word is passed through quotemeta so it is matched
# literally once it is placed in a regular expression.
#
# Returns the list of escaped words.
# Dies if the file cannot be opened or contains no words.
sub load_word_list()
{
    # Lexical handle and three-argument open: no global bareword handle, and
    # the file name cannot be misread as part of the open mode.
    open my $wordlist_fh, '<', $word_list_file
        or die "$0: Failed to open word-list file '$word_list_file': $!\n" ;

    my @words ;
    while ( my $word = <$wordlist_fh> ) {
        chomp $word;
        $word =~ s/^\s+//;
        $word =~ s/\s+$//;
        next if length($word) == 0;
        push @words, quotemeta $word;
    }
    close $wordlist_fh;

    die "$0: Error: word-list file '$word_list_file' is empty!\n"
        unless @words;

    return @words;
}
138
# Build one compiled regular expression that matches any of the given words.
#
# Arguments: the list of words, already escaped for use inside a regex
#            (load_word_list() applies quotemeta).
# Reads:     $find_complete_words   - match whole words only
#            $find_case_insensitive - compile with /i
# Returns:   a qr// object.
sub compile_regex(@)
{
    my @words = @_;

    my $regex_string = join ( '|', @words ) ;

    if ( $find_complete_words ) {
        # A match must not be directly preceded or followed by a word
        # character. Unlike \b...\b this also works for list entries that
        # begin or end with a non-word character (e.g. "C++" or "-foo"),
        # and it is the same as \b for ordinary words. The group is
        # non-capturing because nothing reads the captured text.
        $regex_string = "(?<!\\w)(?:$regex_string)(?!\\w)";
    }

    return $find_case_insensitive
        ? qr/$regex_string/i
        : qr/$regex_string/;
}
157
# Print the help text (program name, options and arguments) to STDOUT,
# then terminate the program.
sub usage()
{
    # $0 is interpolated so the text shows the name the script was run as.
    my $help_text = <<"EOF";

Word-List Grep
Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )

Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]

-s - do not filter first line - always output the first line from the input file.
-w - search for complete words (not partial sub-strings).
-i - case insensitive search.
-v - inverse - output lines NOT matching the word list.
-c N - check only column N, instead of entire line (line split by whitespace).
-o OUT - specify output file (default = STDOUT).
WORD-LIST-FILE - file containing one word per line. These will be used
for the search.
INPUT-FILE - (optional) read from file (default = from STDIN).



EOF

    print $help_text;

    exit;
}