Mercurial > repos > geert-vandeweyer > advanced_grep_from_file
view GrepFile.pl @ 2:36fa1f424923 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:37:36 -0500 |
parents | dba6ffec8e2e |
children | 606e24c6fda0 |
line wrap: on
line source
#!/usr/bin/perl
#
# GrepFile.pl - grep an input file for one pattern or for a list of patterns.
#
# Two modes, selected with -t:
#   single : -f holds the pattern itself; grep is run once on the input file.
#   file   : -f names a file with one pattern per line. The unique patterns
#            are split into batches, each batch is handed to `grep -f` by a
#            pool of worker threads, and one printer thread collects the
#            output. Output order across batches is therefore not guaranteed.
#
# All external commands are started without a shell (list-form pipe open), so
# patterns and file names may contain quotes, spaces or shell metacharacters.

use strict;
use warnings;

# load modules
use Getopt::Std;
use threads;
use Thread::Queue;
use threads::shared;
use File::Copy qw(copy);
use File::Path qw(rmtree);
use File::Temp qw(tempdir);

my $now = time;

# opts
# i : infile
# f : pattern file (-t file) or the pattern itself (-t single)
# o : output file
# t : type (file/single)
# I : Insensitive to case
# P : Perl-Based Grep (boolean)
# A : number of extra lines to fetch
my %opts;
getopts('i:f:o:t:PIA:', \%opts);

## nr of grep threads (if created), one extra is created for printing.
my $nrgrep = 6;
## nr of patterns handed to a single grep call.
my $batch_size = 100;
## nr of non-empty results the printer buffers before appending them to disk.
my $flush_after = 50;

## infile && outfile check
if (!defined($opts{'i'})) { die('Input file is mandatory'); }
my $infile = $opts{'i'};
if (!defined($opts{'o'})) { die('Output file is mandatory'); }
my $outfile = $opts{'o'};

## type? Validated up front, before anything is created on disk.
if (!defined($opts{'t'})) { die('Pattern source is mandatory (file/single)'); }
my $type = $opts{'t'};
if ($type ne 'file' && $type ne 'single') {
    die('only "file" and "single" are supported as value of the -t flag');
}
if (!defined($opts{'f'})) { die('Pattern (or pattern file) is mandatory'); }

## extra grep arguments, kept as a list so no shell quoting is ever needed.
my @grep_args;
if (defined($opts{'I'})) { push(@grep_args, '-i'); }
if (defined($opts{'P'})) { push(@grep_args, '-P'); }
if (defined($opts{'A'})) {
    if ($opts{'A'} =~ m/\A\d+\z/) {
        push(@grep_args, '-A', $opts{'A'});
    }
    else {
        die("Invalid amount of lines to fetch provided (must be integer)");
    }
}

## create tmp location & outfile. tempdir() picks a unique name and creates
## the directory atomically (and croaks when it cannot).
my $tmpdir      = tempdir('GrepFile.XXXXXX', DIR => '/tmp');
my $result_file = "$tmpdir/result.txt";

## Queues for file mode. They are only created when threads are needed; the
## worker subs read these file-scoped variables.
my ($grepqueue, $printqueue);

## Run the job inside eval so the tmp dir is removed on failure as well.
my $succeeded = eval {
    # Start from an empty result file so the final copy always has a source,
    # even when nothing matched.
    open(my $empty, '>', $result_file)
        or die("Could not create '$result_file': $!\n");
    close($empty) or die("Could not close '$result_file': $!\n");

    if ($type eq 'single') {
        grep_single($opts{'f'});
    }
    else {
        grep_pattern_file($opts{'f'});
    }

    copy($result_file, $outfile)
        or die("Could not copy results to '$outfile': $!\n");
    1;
};
my $failure = $@;
rmtree($tmpdir);
if (!$succeeded) { die($failure); }

##################
# PRINT RUN-TIME #
##################
$now = time - $now;
printf("\n\nRunning time:%02d:%02d:%02d\n", int($now / 3600), int(($now % 3600) / 60), int($now % 60));

# Run `grep @grep_argv` without a shell and feed every output line to a
# callback.
#   $on_line   : code ref, called with one output line (newline included)
#   @grep_argv : arguments passed to grep verbatim
# Returns the raw wait status ($?) of grep; dies when grep cannot be started.
sub run_grep {
    my ($on_line, @grep_argv) = @_;
    open(my $pipe, '-|', 'grep', @grep_argv)
        or die("Could not start grep: $!\n");
    while (my $line = <$pipe>) {
        $on_line->($line);
    }
    close($pipe);    # a false return only reflects grep's exit code, see $?
    return $?;
}

# True when a wait status denotes a real grep failure. Exit code 1 only means
# "no lines selected" and is not an error.
sub grep_failed {
    my ($status) = @_;
    return 1 if $status & 127;           # terminated by a signal
    return (($status >> 8) > 1) ? 1 : 0;
}

# Single-pattern mode: stream the matches of one pattern into the result file.
# The pattern is passed with -e so a leading '-' is not taken for an option.
sub grep_single {
    my ($pattern) = @_;
    open(my $out, '>', $result_file)
        or die("Could not write '$result_file': $!\n");
    my $status = run_grep(
        sub { print {$out} $_[0]; },
        @grep_args, '-e', $pattern, '--', $infile
    );
    close($out) or die("Could not close '$result_file': $!\n");
    if (grep_failed($status)) {
        die("grep failed on '$infile' (wait status $status)\n");
    }
    return;
}

# Read the pattern file, drop duplicates and blank lines, and write the
# patterns in batches of $batch_size to "$batch_dir/1", "$batch_dir/2", ...
# Blank lines are skipped because an empty pattern makes grep select every
# line of the input.
# Returns the number of batch files written (0 for an empty pattern file).
sub write_pattern_batches {
    my ($pattfile, $batch_dir) = @_;

    open(my $in, '<', $pattfile)
        or die("Could not open pattern file '$pattfile': $!\n");
    my %seen;
    my @patterns;
    while (my $line = <$in>) {
        chomp($line);
        next if $line eq '';
        next if $seen{$line}++;
        push(@patterns, $line);
    }
    close($in);

    mkdir($batch_dir) or die("Could not create '$batch_dir': $!\n");
    my $idx = 0;
    while (my @subset = splice(@patterns, 0, $batch_size)) {
        $idx++;
        open(my $out, '>', "$batch_dir/$idx")
            or die("Could not write '$batch_dir/$idx': $!\n");
        print {$out} join("\n", @subset), "\n";
        close($out) or die("Could not close '$batch_dir/$idx': $!\n");
    }
    return $idx;
}

# Pattern-file mode: prepare the batches, then let $nrgrep grep threads and
# one printer thread work through them.
sub grep_pattern_file {
    my ($pattfile) = @_;

    # All fallible preparation happens before any thread exists, so a die
    # here never leaves running threads behind. The pattern list itself lives
    # only inside write_pattern_batches() and is gone before threads start.
    my $batches = write_pattern_batches($pattfile, "$tmpdir/pattfiles");

    ## copy infile to local system for speed
    copy($infile, "$tmpdir/infile")
        or die("Could not copy '$infile' to '$tmpdir/infile': $!\n");

    $grepqueue  = Thread::Queue->new();
    $printqueue = Thread::Queue->new();
    my @grep_threads;
    for my $i (1 .. $nrgrep) {
        my $thread = threads->create(\&grep_worker);
        push(@grep_threads, $thread);
    }
    my $print_thread = threads->create(\&print_worker);

    for my $idx (1 .. $batches) {
        $grepqueue->enqueue($idx);
    }
    # One undef per grep thread tells it that no more work will arrive.
    for my $i (1 .. $nrgrep) {
        $grepqueue->enqueue(undef);
    }
    for my $thread (@grep_threads) {
        $thread->join();
    }

    # Only after all grep threads are done is the printer told to stop.
    $printqueue->enqueue(undef);
    my $print_error = $print_thread->join();
    if ($print_error) { die($print_error); }
    return;
}

# Thread body: take batch numbers from $grepqueue until undef is dequeued,
# grep the local input copy with that batch's pattern file and pass the
# collected output to $printqueue.
sub grep_worker {
    while (defined(my $idx = $grepqueue->dequeue())) {
        my $out    = '';
        my $status = run_grep(
            sub {
                my ($line) = @_;
                # Drop the "--" lines grep prints between context groups
                # (-A). As before, an input line that is exactly "--" is
                # dropped as well.
                return if $line =~ m/\A--\n?\z/;
                $out .= $line;
            },
            @grep_args, '-f', "$tmpdir/pattfiles/$idx", '--', "$tmpdir/infile"
        );
        if (grep_failed($status)) {
            warn("grep failed on pattern batch $idx (wait status $status)\n");
        }
        $printqueue->enqueue($out);
    }
    return;
}

# Thread body: collect results from $printqueue until undef is dequeued and
# append them to the result file, flushing after more than $flush_after
# non-empty results.
# Returns '' on success or the error message when writing failed.
sub print_worker {
    my $ok = eval {
        my $output  = '';
        my $counter = 0;
        while (defined(my $result = $printqueue->dequeue())) {
            if ($result ne '' && $result ne "\n") {
                $output .= $result;
                $counter++;
            }
            if ($counter > $flush_after) {
                append_results($output);
                $output  = '';
                $counter = 0;
            }
        }
        if ($output ne '') {
            append_results($output);
        }
        1;
    };
    return $ok ? '' : "Writing results failed: $@";
}

# Append a chunk of text to the result file.
sub append_results {
    my ($text) = @_;
    open(my $out, '>>', $result_file)
        or die("Could not append to '$result_file': $!\n");
    print {$out} $text;
    close($out) or die("Could not close '$result_file': $!\n");
    return;
}