Mercurial > repos > xuebing > sharplabtool
comparison tools/unix_tools/word_list_grep.pl @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Std; | |
5 | |
6 sub parse_command_line(); | |
7 sub load_word_list(); | |
8 sub compile_regex(@); | |
9 sub usage(); | |
10 | |
11 my $word_list_file; | |
12 my $input_file ; | |
13 my $output_file; | |
14 my $find_complete_words ; | |
15 my $find_inverse; | |
16 my $find_in_specific_column ; | |
17 my $find_case_insensitive ; | |
18 my $skip_first_line ; | |
19 | |
20 | |
21 ## | |
22 ## Program Start | |
23 ## | |
24 usage() if @ARGV==0; | |
25 parse_command_line(); | |
26 | |
27 my @words = load_word_list(); | |
28 | |
29 my $regex = compile_regex(@words); | |
30 | |
31 # Allow first line to pass without filtering? | |
32 if ( $skip_first_line ) { | |
33 my $line = <$input_file>; | |
34 print $output_file $line ; | |
35 } | |
36 | |
37 | |
38 ## | |
39 ## Main loop | |
40 ## | |
41 while ( <$input_file> ) { | |
42 my $target = $_; | |
43 | |
44 | |
45 # If searching in a specific column (and not in the entire line) | |
46 # extract the content of that one column | |
47 if ( $find_in_specific_column ) { | |
48 my @columns = split ; | |
49 | |
50 #not enough columns in this line - skip it | |
51 next if ( @columns < $find_in_specific_column ) ; | |
52 | |
53 $target = $columns [ $find_in_specific_column - 1 ] ; | |
54 } | |
55 | |
56 # Match ? | |
57 if ( ($target =~ $regex) ^ ($find_inverse) ) { | |
58 print $output_file $_ ; | |
59 } | |
60 } | |
61 | |
62 close $input_file; | |
63 close $output_file; | |
64 | |
65 ## | |
66 ## Program end | |
67 ## | |
68 | |
69 | |
70 sub parse_command_line() | |
71 { | |
72 my %opts ; | |
73 getopts('siwvc:o:', \%opts) or die "$0: Invalid option specified\n"; | |
74 | |
75 die "$0: missing word-list file name\n" if (@ARGV==0); | |
76 | |
77 $word_list_file = $ARGV[0]; | |
78 die "$0: Word-list file '$word_list_file' not found\n" unless -e $word_list_file ; | |
79 | |
80 $find_complete_words = ( exists $opts{w} ) ; | |
81 $find_inverse = ( exists $opts{v} ) ; | |
82 $find_case_insensitive = ( exists $opts{i} ) ; | |
83 $skip_first_line = ( exists $opts{s} ) ; | |
84 | |
85 | |
86 # Search in specific column ? | |
87 if ( defined $opts{c} ) { | |
88 $find_in_specific_column = $opts{c}; | |
89 | |
90 die "$0: invalid column number ($find_in_specific_column).\n" | |
91 unless $find_in_specific_column =~ /^\d+$/ ; | |
92 | |
93 die "$0: invalid column number ($find_in_specific_column).\n" | |
94 if $find_in_specific_column <= 0; | |
95 } | |
96 else { | |
97 $find_in_specific_column = 0 ; | |
98 } | |
99 | |
100 | |
101 # Output File specified (instead of STDOUT) ? | |
102 if ( defined $opts{o} ) { | |
103 my $filename = $opts{o}; | |
104 open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ; | |
105 } else { | |
106 $output_file = *STDOUT ; | |
107 } | |
108 | |
109 | |
110 | |
111 # Input file Specified (instead of STDIN) ? | |
112 if ( @ARGV>1 ) { | |
113 my $filename = $ARGV[1]; | |
114 open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ; | |
115 } else { | |
116 $input_file = *STDIN; | |
117 } | |
118 } | |
119 | |
120 sub load_word_list() | |
121 { | |
122 open WORDLIST, "<$word_list_file" or die "$0: Failed to open word-list file '$word_list_file'\n" ; | |
123 my @words ; | |
124 while ( <WORDLIST> ) { | |
125 chomp ; | |
126 s/^\s+//; | |
127 s/\s+$//; | |
128 next if length==0; | |
129 push @words,quotemeta $_; | |
130 } | |
131 close WORDLIST; | |
132 | |
133 die "$0: Error: word-list file '$word_list_file' is empty!\n" | |
134 unless @words; | |
135 | |
136 return @words; | |
137 } | |
138 | |
139 sub compile_regex(@) | |
140 { | |
141 my @words = @_; | |
142 | |
143 my $regex_string = join ( '|', @words ) ; | |
144 if ( $find_complete_words ) { | |
145 $regex_string = "\\b($regex_string)\\b"; | |
146 } | |
147 my $regex; | |
148 | |
149 if ( $find_case_insensitive ) { | |
150 $regex = qr/$regex_string/i ; | |
151 } else { | |
152 $regex = qr/$regex_string/; | |
153 } | |
154 | |
155 return $regex; | |
156 } | |
157 | |
158 sub usage() | |
159 { | |
160 print <<EOF; | |
161 | |
162 Word-List Grep | |
163 Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu ) | |
164 | |
165 Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE] | |
166 | |
167 -s - do not filter first line - always output the first line from the input file. | |
168 -w - search for complete words (not partial sub-strings). | |
169 -i - case insensitive search. | |
170 -v - inverse - output lines NOT matching the word list. | |
171 -c N - check only column N, instead of entire line (line split by whitespace). | |
172 -o OUT - specify output file (default = STDOUT). | |
173 WORD-LIST-FILE - file containing one word per line. These will be used | |
174 for the search. | |
175 INPUT-FILE - (optional) read from file (default = from STDIN). | |
176 | |
177 | |
178 | |
179 EOF | |
180 | |
181 exit; | |
182 } |