diff GrepFile.pl @ 1:dba6ffec8e2e draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:37:30 -0500
parents
children 606e24c6fda0
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GrepFile.pl	Thu Feb 13 08:37:30 2014 -0500
@@ -0,0 +1,167 @@
+#!/usr/bin/perl
+
+# load modules
+use Getopt::Std;
+use threads;
+use Thread::Queue;
+use threads::shared;
+
+## GrepFile.pl : grep one pattern, or a file of patterns, out of an input file.
+## With a pattern file (-t file) the unique patterns are split into batches of
+## 100 that are handled by $nrgrep grep threads, while one extra thread collects
+## the results. With a single pattern (-t single) grep is run once, directly.
+
+$now = time ;
+
+# opts
+# i : infile
+# f : pattern (-t single) or patternfile (-t file)
+# o : output file
+# t : type (file/single)
+# I : Insensitive to case
+# P : Perl-Based Grep (boolean)
+# A : number of extra lines to fetch
+getopts('i:f:o:t:PIA:', \%opts) ;
+
+## variables for threads.
+my $infile :shared;
+my $outfile :shared;
+my $args :shared;
+my $rand :shared;
+my $tmpdir :shared;
+
+## nr of grep threads (if created), one extra is created for printing.
+my $nrgrep = 6;
+
+
+## validate all options before anything is created on disk, so that a bad
+## invocation does not leave a temporary directory behind.
+if (!defined($opts{'i'})) { die('Input file is mandatory');}
+$infile = $opts{'i'};
+if (!defined($opts{'o'})) { die('Output file is mandatory');}
+$outfile = $opts{'o'};
+if (!defined($opts{'t'})) { die('Pattern source is mandatory (file/single)');}
+$type = $opts{'t'};
+if ($type ne 'file' && $type ne 'single') {
+	die('only "file" and "single" are supported as value of the -t flag');
+}
+if (!defined($opts{'f'})) { die('Pattern (or pattern file) is mandatory');}
+
+## grep flags. Kept as one string, because the grep() worker interpolates it
+## into its shell command.
+$args = '';
+if (defined($opts{'I'})) {
+	$args .= ' -i';
+}
+if (defined($opts{'P'})) {
+	$args .= " -P";
+}
+if (defined($opts{'A'})) {
+	if ($opts{'A'} =~ m/^\d+$/) {
+		$args .= " -A $opts{'A'}";
+	}
+	else {
+		die("Invalid amount of lines to fetch provided (must be integer)");
+	}
+}
+
+## create tmp location. mkdir fails when the directory already exists, so
+## looping on mkdir itself avoids the gap between testing and creating.
+my $attempts = 0;
+do {
+	if (++$attempts > 100) {
+		die("Could not create a temporary directory under /tmp: $!");
+	}
+	$rand = int(rand(10000));
+	$tmpdir = "/tmp/GrepFile.$rand";
+} until (mkdir($tmpdir));
+
+if ($type eq 'single') {
+	## single pattern => direct processing, no threads are created.
+	## grep is started without a shell (list form), so quotes and spaces in
+	## the pattern or the file name cannot break the command.
+	my @grepargs = split(' ', $args);
+	open(my $grepfh, '-|', 'grep', @grepargs, '-e', $opts{'f'}, '--', $infile)
+		or die("Could not run grep: $!");
+	open(my $resfh, '>', "$tmpdir/result.txt")
+		or die("Could not write $tmpdir/result.txt: $!");
+	while (my $line = <$grepfh>) {
+		print {$resfh} $line;
+	}
+	close($resfh) or die("Could not write $tmpdir/result.txt: $!");
+	close($grepfh);
+	## grep exits with 1 when nothing matched; only higher values are errors.
+	if (($? >> 8) > 1) {
+		die("grep failed on '$infile'");
+	}
+}
+else {
+	## file based patterns => threaded processing.
+	## open the pattern file first, so a wrong path fails before threads exist.
+	open(my $pattfh, '<', $opts{'f'})
+		or die("Could not open pattern file '$opts{'f'}': $!");
+
+	## the queues are package globals on purpose: grep() and printout()
+	## refer to them by name.
+	$grepqueue = Thread::Queue->new();
+	$printqueue = Thread::Queue->new();
+	## threads are started before the patterns are loaded, so the pattern
+	## list is not copied into every thread.
+	my @grepthreads;
+	for (my $nr = 1; $nr <= $nrgrep; $nr++) {
+		push(@grepthreads, threads->create('grep'));
+	}
+	my $printer = threads->create('printout');
+
+	## make sure patterns are unique
+	my %pats;
+	while (my $line = <$pattfh>) {
+		chomp($line);
+		## an empty pattern makes 'grep -f' match every line: skip it.
+		next if ($line eq '');
+		$pats{$line} = 1;
+	}
+	close($pattfh);
+
+	## copy infile to local system for speed
+	system('cp', $infile, "$tmpdir/infile") == 0
+		or die("Could not copy '$infile' to $tmpdir/infile");
+	mkdir("$tmpdir/pattfiles")
+		or die("Could not create $tmpdir/pattfiles: $!");
+	## make sure the result file exists, also when nothing matches, so the
+	## final copy to the output file always has a source.
+	open(my $touchfh, '>>', "$tmpdir/result.txt")
+		or die("Could not create $tmpdir/result.txt: $!");
+	close($touchfh);
+
+	## run in batches of 100 patterns; each batch file is queued by number.
+	my @patterns = keys(%pats);
+	my $batchnr = 0;
+	while (my @subset = splice(@patterns,0,100)) {
+		$batchnr++;
+		open(my $batchfh, '>', "$tmpdir/pattfiles/$batchnr")
+			or die("Could not write $tmpdir/pattfiles/$batchnr: $!");
+		print {$batchfh} join("\n",@subset);
+		close($batchfh) or die("Could not write $tmpdir/pattfiles/$batchnr: $!");
+		$grepqueue->enqueue($batchnr);
+	}
+	## one undef per grep thread tells it that the work is done.
+	for (my $nr = 1; $nr <= $nrgrep; $nr++) {
+		$grepqueue->enqueue(undef);
+	}
+	foreach my $thread (@grepthreads) {
+		$thread->join();
+	}
+	## all results are queued now, so the printer can be stopped as well.
+	$printqueue->enqueue(undef);
+	$printer->join();
+}
+
+## deliver the result and clean up (list form: no shell quoting issues).
+system('cp', "$tmpdir/result.txt", $outfile) == 0
+	or die("Could not copy the results to '$outfile'");
+
+system('rm', '-Rf', $tmpdir);
+
+##################
+# PRINT RUN-TIME #
+##################
+$now = time - $now;
+printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60));
+
+## Worker thread: takes batch numbers from $grepqueue until an undef arrives,
+## greps the local copy of the input file with the matching pattern file and
+## hands the complete output of each batch to $printqueue.
+sub grep {
+	# thread-local snapshots of the shared settings
+	my ($in, $flags, $workdir) = ($infile, $args, $tmpdir);
+	while (1) {
+		my $batch = $grepqueue->dequeue();
+		# undef is the stop signal
+		last if (!defined($batch));
+		# the second grep drops the '--' group separator lines
+		$command = "grep $flags -f '$workdir/pattfiles/$batch' '$workdir/infile' | grep -v '^--\$'";
+		my $hits = qx{$command};
+		$printqueue->enqueue($hits);
+	}
+}
+
+## Printer thread: takes grep results from $printqueue until an undef arrives
+## and appends every non-empty result to $tmpdir/result.txt.
+## Results are collected in memory and written once more than 50 of them
+## are pending; the remainder is written when the queue is finished.
+sub printout {
+	my $resultfile = "$tmpdir/result.txt";
+	# open (and thereby create) the file up front: it then exists even when
+	# nothing matched, and a failure is reported instead of silently ignored.
+	open(my $resfh, '>>', $resultfile)
+		or die("Could not open $resultfile for writing: $!");
+	my $counter = 0;
+	my $output = '';
+	while (defined(my $result = $printqueue->dequeue())) {
+		# skip batches without any match
+		if ($result ne '' && $result ne "\n") {
+			$output .= $result;
+			$counter++;
+		}
+		if ($counter > 50) {
+			print {$resfh} $output
+				or die("Could not write to $resultfile: $!");
+			$output = '';
+			$counter = 0;
+		}
+	}
+	# write whatever is left after the stop signal
+	if ($output ne '') {
+		print {$resfh} $output
+			or die("Could not write to $resultfile: $!");
+	}
+	# buffered write errors only show up at close
+	close($resfh) or die("Could not write to $resultfile: $!");
+}
+