0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Std;
|
|
5
|
|
6 sub parse_command_line();
|
|
7 sub load_word_list();
|
|
8 sub compile_regex(@);
|
|
9 sub usage();
|
|
10
|
|
11 my $word_list_file;
|
|
12 my $input_file ;
|
|
13 my $output_file;
|
|
14 my $find_complete_words ;
|
|
15 my $find_inverse;
|
|
16 my $find_in_specific_column ;
|
|
17 my $find_case_insensitive ;
|
|
18 my $skip_first_line ;
|
|
19
|
|
20
|
|
21 ##
|
|
22 ## Program Start
|
|
23 ##
|
|
24 usage() if @ARGV==0;
|
|
25 parse_command_line();
|
|
26
|
|
27 my @words = load_word_list();
|
|
28
|
|
29 my $regex = compile_regex(@words);
|
|
30
|
|
31 # Allow first line to pass without filtering?
|
|
32 if ( $skip_first_line ) {
|
|
33 my $line = <$input_file>;
|
|
34 print $output_file $line ;
|
|
35 }
|
|
36
|
|
37
|
|
38 ##
|
|
39 ## Main loop
|
|
40 ##
|
|
41 while ( <$input_file> ) {
|
|
42 my $target = $_;
|
|
43
|
|
44
|
|
45 # If searching in a specific column (and not in the entire line)
|
|
46 # extract the content of that one column
|
|
47 if ( $find_in_specific_column ) {
|
|
48 my @columns = split ;
|
|
49
|
|
50 #not enough columns in this line - skip it
|
|
51 next if ( @columns < $find_in_specific_column ) ;
|
|
52
|
|
53 $target = $columns [ $find_in_specific_column - 1 ] ;
|
|
54 }
|
|
55
|
|
56 # Match ?
|
|
57 if ( ($target =~ $regex) ^ ($find_inverse) ) {
|
|
58 print $output_file $_ ;
|
|
59 }
|
|
60 }
|
|
61
|
|
62 close $input_file;
|
|
63 close $output_file;
|
|
64
|
|
65 ##
|
|
66 ## Program end
|
|
67 ##
|
|
68
|
|
69
|
|
70 sub parse_command_line()
|
|
71 {
|
|
72 my %opts ;
|
|
73 getopts('siwvc:o:', \%opts) or die "$0: Invalid option specified\n";
|
|
74
|
|
75 die "$0: missing word-list file name\n" if (@ARGV==0);
|
|
76
|
|
77 $word_list_file = $ARGV[0];
|
|
78 die "$0: Word-list file '$word_list_file' not found\n" unless -e $word_list_file ;
|
|
79
|
|
80 $find_complete_words = ( exists $opts{w} ) ;
|
|
81 $find_inverse = ( exists $opts{v} ) ;
|
|
82 $find_case_insensitive = ( exists $opts{i} ) ;
|
|
83 $skip_first_line = ( exists $opts{s} ) ;
|
|
84
|
|
85
|
|
86 # Search in specific column ?
|
|
87 if ( defined $opts{c} ) {
|
|
88 $find_in_specific_column = $opts{c};
|
|
89
|
|
90 die "$0: invalid column number ($find_in_specific_column).\n"
|
|
91 unless $find_in_specific_column =~ /^\d+$/ ;
|
|
92
|
|
93 die "$0: invalid column number ($find_in_specific_column).\n"
|
|
94 if $find_in_specific_column <= 0;
|
|
95 }
|
|
96 else {
|
|
97 $find_in_specific_column = 0 ;
|
|
98 }
|
|
99
|
|
100
|
|
101 # Output File specified (instead of STDOUT) ?
|
|
102 if ( defined $opts{o} ) {
|
|
103 my $filename = $opts{o};
|
|
104 open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ;
|
|
105 } else {
|
|
106 $output_file = *STDOUT ;
|
|
107 }
|
|
108
|
|
109
|
|
110
|
|
111 # Input file Specified (instead of STDIN) ?
|
|
112 if ( @ARGV>1 ) {
|
|
113 my $filename = $ARGV[1];
|
|
114 open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ;
|
|
115 } else {
|
|
116 $input_file = *STDIN;
|
|
117 }
|
|
118 }
|
|
119
|
|
120 sub load_word_list()
|
|
121 {
|
|
122 open WORDLIST, "<$word_list_file" or die "$0: Failed to open word-list file '$word_list_file'\n" ;
|
|
123 my @words ;
|
|
124 while ( <WORDLIST> ) {
|
|
125 chomp ;
|
|
126 s/^\s+//;
|
|
127 s/\s+$//;
|
|
128 next if length==0;
|
|
129 push @words,quotemeta $_;
|
|
130 }
|
|
131 close WORDLIST;
|
|
132
|
|
133 die "$0: Error: word-list file '$word_list_file' is empty!\n"
|
|
134 unless @words;
|
|
135
|
|
136 return @words;
|
|
137 }
|
|
138
|
|
139 sub compile_regex(@)
|
|
140 {
|
|
141 my @words = @_;
|
|
142
|
|
143 my $regex_string = join ( '|', @words ) ;
|
|
144 if ( $find_complete_words ) {
|
|
145 $regex_string = "\\b($regex_string)\\b";
|
|
146 }
|
|
147 my $regex;
|
|
148
|
|
149 if ( $find_case_insensitive ) {
|
|
150 $regex = qr/$regex_string/i ;
|
|
151 } else {
|
|
152 $regex = qr/$regex_string/;
|
|
153 }
|
|
154
|
|
155 return $regex;
|
|
156 }
|
|
157
|
|
158 sub usage()
|
|
159 {
|
|
160 print <<EOF;
|
|
161
|
|
162 Word-List Grep
|
|
163 Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )
|
|
164
|
|
165 Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]
|
|
166
|
|
167 -s - do not filter first line - always output the first line from the input file.
|
|
168 -w - search for complete words (not partial sub-strings).
|
|
169 -i - case insensitive search.
|
|
170 -v - inverse - output lines NOT matching the word list.
|
|
171 -c N - check only column N, instead of entire line (line split by whitespace).
|
|
172 -o OUT - specify output file (default = STDOUT).
|
|
173 WORD-LIST-FILE - file containing one word per line. These will be used
|
|
174 for the search.
|
|
175 INPUT-FILE - (optional) read from file (default = from STDIN).
|
|
176
|
|
177
|
|
178
|
|
179 EOF
|
|
180
|
|
181 exit;
|
|
182 }
|