diff tools/unix_tools/word_list_grep.pl @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/word_list_grep.pl	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,182 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Std;
+
+sub parse_command_line();
+sub load_word_list();
+sub compile_regex(@);
+sub usage();
+
+my $word_list_file;
+my $input_file ;
+my $output_file;
+my $find_complete_words ;
+my $find_inverse; 
+my $find_in_specific_column ;
+my $find_case_insensitive ;
+my $skip_first_line ;
+
+
+##
+## Program Start
+##
+usage() if @ARGV==0;
+parse_command_line();
+
+my @words = load_word_list();
+
+my $regex = compile_regex(@words);
+
+# Allow first line to pass without filtering?
+if ( $skip_first_line ) {
+	my $line = <$input_file>;
+	print $output_file $line ;
+}
+
+
+##
+## Main loop
+##
+while ( <$input_file> ) {
+	my $target = $_;
+
+
+	# If searching in a specific column (and not in the entire line)
+	# extract the content of that one column
+	if ( $find_in_specific_column ) {
+		my @columns = split ;
+
+		#not enough columns in this line - skip it
+		next if ( @columns < $find_in_specific_column ) ;
+
+		$target = $columns [ $find_in_specific_column - 1 ] ;
+	}
+
+	# Match ?
+	if ( ($target =~ $regex) ^ ($find_inverse) ) {
+		print $output_file $_ ;
+	}
+}
+
+close $input_file;
+close $output_file;
+
+##
+## Program end
+##
+
+
+sub parse_command_line()
+{
+	my %opts ;
+	getopts('siwvc:o:', \%opts) or die "$0: Invalid option specified\n";
+
+	die "$0: missing word-list file name\n" if (@ARGV==0); 
+
+	$word_list_file = $ARGV[0];
+	die "$0: Word-list file '$word_list_file' not found\n" unless -e $word_list_file ;
+
+	$find_complete_words = ( exists $opts{w} ) ;
+	$find_inverse = ( exists $opts{v} ) ;
+	$find_case_insensitive = ( exists $opts{i} ) ;
+	$skip_first_line = ( exists $opts{s} ) ;
+
+
+	# Search in specific column ?
+	if ( defined $opts{c} ) {
+		$find_in_specific_column = $opts{c};
+
+		die "$0: invalid column number ($find_in_specific_column).\n"
+			unless $find_in_specific_column =~ /^\d+$/ ;
+			
+		die "$0: invalid column number ($find_in_specific_column).\n"
+			if $find_in_specific_column <= 0; 
+	}
+	else {
+		$find_in_specific_column = 0 ;
+	}
+
+
+	# Output File specified (instead of STDOUT) ?
+	if ( defined $opts{o} ) {
+		my $filename = $opts{o};
+		open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ;
+	} else {
+		$output_file = *STDOUT ;
+	}
+
+
+
+	# Input file Specified (instead of STDIN) ?
+	if ( @ARGV>1 ) {
+		my $filename = $ARGV[1];
+		open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ;
+	} else {
+		$input_file = *STDIN;
+	}
+}
+
+sub load_word_list()
+{
+	open WORDLIST, "<$word_list_file" or die "$0: Failed to open word-list file '$word_list_file'\n" ;
+	my @words ;
+	while ( <WORDLIST> ) {
+		chomp ;
+		s/^\s+//;
+		s/\s+$//;
+		next if length==0;
+		push @words,quotemeta $_;
+	}
+	close WORDLIST;
+
+	die "$0: Error: word-list file '$word_list_file' is empty!\n" 
+       		unless @words;
+
+	return @words;	
+}
+
+sub compile_regex(@)
+{
+	my @words = @_;
+
+	my $regex_string = join ( '|', @words ) ;
+	if ( $find_complete_words ) {
+		$regex_string = "\\b($regex_string)\\b"; 
+	}
+	my $regex;
+
+	if ( $find_case_insensitive ) {
+		$regex = qr/$regex_string/i ;
+	} else {
+		$regex = qr/$regex_string/;
+	}
+
+	return $regex;
+}
+
+sub usage()
+{
+print <<EOF;
+
+Word-List Grep
+Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )
+
+Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]
+
+   -s   - do not filter first line - always output the first line from the input file.
+   -w   - search for complete words (not partial sub-strings).
+   -i   - case insensitive search.
+   -v   - inverse - output lines NOT matching the word list.
+   -c N - check only column N, instead of entire line (line split by whitespace).
+   -o OUT - specify output file (default = STDOUT).
+   WORD-LIST-FILE - file containing one word per line. These will be used
+          for the search. 
+   INPUT-FILE - (optional) read from file (default = from STDIN).
+
+
+
+EOF
+
+	exit;
+}