Mercurial > repos > geert-vandeweyer > advanced_grep_from_file
comparison GrepFile.pl @ 1:dba6ffec8e2e draft
Uploaded
| author | geert-vandeweyer |
|---|---|
| date | Thu, 13 Feb 2014 08:37:30 -0500 |
| parents | |
| children | 606e24c6fda0 |
comparison
equal
deleted
inserted
replaced
| 0:fd6b71c81011 | 1:dba6ffec8e2e |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 # load modules | |
| 4 use Getopt::Std; | |
| 5 use threads; | |
| 6 use Thread::Queue; | |
| 7 use threads::shared; | |
| 8 | |
## Start time, used for the run-time report printed at the end.
$now = time;

# opts
# i : infile
# f : pattern file (-t file) or the pattern itself (-t single)
# o : output file
# t : type (file/single)
# I : Insensitive to case
# P : Perl-Based Grep (boolean)
# A : number of extra lines to fetch
getopts('i:f:o:t:PIA:', \%opts);

## variables for threads: read by the worker subs 'grep' and 'printout'.
my $infile :shared;
my $outfile :shared;
my $args :shared;
my $rand :shared;
my $tmpdir :shared;

## nr of grep threads (if created), one extra is created for printing.
my $nrgrep = 6;

## Validate every option up front, before any temporary file or thread exists,
## so that a usage error cannot leave a stale directory behind in /tmp.
if (!defined($opts{'i'})) { die('Input file is mandatory'); }
$infile = $opts{'i'};
if (!defined($opts{'o'})) { die('Output file is mandatory'); }
$outfile = $opts{'o'};
if (!defined($opts{'t'})) { die('Pattern source is mandatory (file/single)'); }
my $type = $opts{'t'};
if ($type ne 'file' && $type ne 'single') {
    die('only "file" and "single" are supported as value of the -t flag');
}
if (!defined($opts{'f'})) {
    die('Pattern (-t single) or pattern file (-t file) is mandatory');
}

## Extra grep options, kept as one space-separated string because the worker
## sub 'grep' reads $args in that form.
$args = '';
if (defined($opts{'I'})) {
    $args .= ' -i';
}
if (defined($opts{'P'})) {
    $args .= ' -P';
}
if (defined($opts{'A'})) {
    # \A...\z (not ^...$) so that a trailing newline is rejected as well.
    if ($opts{'A'} =~ m/\A\d+\z/) {
        $args .= " -A $opts{'A'}";
    }
    else {
        die("Invalid amount of lines to fetch provided (must be integer)");
    }
}

## create tmp location.
## mkdir is atomic, so "try to create, pick another name on EEXIST" has no
## window between the existence check and the creation.
my $created = 0;
for my $attempt (1 .. 100) {
    $rand   = int(rand(10000));
    $tmpdir = "/tmp/GrepFile.$rand";
    if (mkdir($tmpdir, 0700)) {
        $created = 1;
        last;
    }
    if (!$!{EEXIST}) {
        die("Could not create temporary directory $tmpdir: $!");
    }
}
if (!$created) {
    die('Could not find a free temporary directory name under /tmp');
}

## Abort helper: remove the temporary directory, then die with the message.
my $fail = sub {
    my ($message) = @_;
    system('rm', '-Rf', $tmpdir);
    die($message);
};

my $resultfile = "$tmpdir/result.txt";

if ($type eq 'single') {
    ## single pattern => direct processing, no threads.
    ## List-form pipe open: no shell is involved, so quotes or spaces in the
    ## pattern and file name are harmless; -e / -- protect a leading '-'.
    my @grep_cmd = ('grep', split(' ', $args), '-e', $opts{'f'}, '--', $infile);
    open(my $grep_fh, '-|', @grep_cmd)
        or $fail->("Could not run grep: $!");
    open(my $result_fh, '>', $resultfile)
        or $fail->("Could not write $resultfile: $!");
    my $chunk;
    while (read($grep_fh, $chunk, 65536)) {
        print {$result_fh} $chunk;
    }
    close($grep_fh);
    my $wait_status = $?;
    close($result_fh)
        or $fail->("Could not write $resultfile: $!");
    # grep exit codes: 0 = lines selected, 1 = none selected, >1 = error.
    if (($wait_status & 127) || ($wait_status >> 8) > 1) {
        $fail->("grep failed on '$infile' (wait status $wait_status)");
    }
}
else {
    ## file based patterns => batches of patterns handled by worker threads.

    ## Make sure the result file exists even when nothing matches; the
    ## printing thread only ever appends to it.
    open(my $touch_fh, '>', $resultfile)
        or $fail->("Could not create $resultfile: $!");
    close($touch_fh);

    ## The queues are package variables on purpose: the worker subs 'grep'
    ## and 'printout' refer to them by name.
    $grepqueue  = Thread::Queue->new();
    $printqueue = Thread::Queue->new();
    my @grep_threads;
    for my $i (1 .. $nrgrep) {
        my $worker = threads->create('grep');
        push(@grep_threads, $worker);
    }
    my $print_thread = threads->create('printout');

    ## make sure patterns are unique
    my $pattfile = $opts{'f'};
    open(my $patt_fh, '<', $pattfile)
        or $fail->("Could not read pattern file '$pattfile': $!");
    my %pats;
    while (my $pattern = <$patt_fh>) {
        chomp($pattern);
        $pats{$pattern} = 1;
    }
    close($patt_fh);

    ## copy infile to local system for speed
    system('cp', '--', $infile, "$tmpdir/infile") == 0
        or $fail->("Could not copy '$infile' to $tmpdir/infile");
    mkdir("$tmpdir/pattfiles")
        or $fail->("Could not create $tmpdir/pattfiles: $!");

    ## run in batches of 100 patterns.
    my @patterns = keys(%pats);
    my $idx = 0;
    while (my @subset = splice(@patterns, 0, 100)) {
        $idx++;
        my $batchfile = "$tmpdir/pattfiles/$idx";
        open(my $batch_fh, '>', $batchfile)
            or $fail->("Could not write $batchfile: $!");
        print {$batch_fh} join("\n", @subset);
        close($batch_fh)
            or $fail->("Could not write $batchfile: $!");
        $grepqueue->enqueue($idx);
    }

    ## one end marker per grep thread, then wait for all of them.
    for my $i (1 .. $nrgrep) {
        $grepqueue->enqueue(undef);
    }
    for my $worker (@grep_threads) {
        $worker->join();
    }

    ## all results are queued by now: stop the printing thread as well.
    $printqueue->enqueue(undef);
    $print_thread->join();
}

system('cp', '--', $resultfile, $outfile) == 0
    or $fail->("Could not copy results to '$outfile'");

system('rm', '-Rf', $tmpdir);

##################
# PRINT RUN-TIME #
##################
$now = time - $now;
printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60));
| 132 | |
## Worker-thread body: greps the local input copy once per queued batch.
##
## Takes batch numbers from $grepqueue until an undef end marker is dequeued.
## For each batch, runs grep with the options in $args and the patterns in
## "$tmpdir/pattfiles/<batch>" against "$tmpdir/infile", and pushes the
## selected text (possibly empty) onto $printqueue. Lines consisting of "--"
## only (the group separator grep prints when -A is used) are dropped.
## A failing grep is reported on STDERR instead of being mistaken for
## "no matches"; the batch then contributes whatever output was produced.
sub grep {
    # local copies of the shared settings
    my $ltmp  = $tmpdir;
    my @largs = defined($args) ? split(' ', $args) : ();

    while (defined(my $idx = $grepqueue->dequeue())) {
        my $out = '';
        my @cmd = ('grep', @largs, '-f', "$ltmp/pattfiles/$idx", '--', "$ltmp/infile");
        # List-form pipe open: no shell in between, so the wait status seen
        # after close() is the one of grep itself.
        if (open(my $grep_fh, '-|', @cmd)) {
            while (my $line = <$grep_fh>) {
                next if $line eq "--\n";
                $out .= $line;
            }
            close($grep_fh);
            # grep exit codes: 0 = lines selected, 1 = none selected, >1 = error.
            if (($? & 127) || ($? >> 8) > 1) {
                warn("grep failed for pattern batch $idx (wait status $?)\n");
            }
        }
        else {
            warn("Could not run grep for pattern batch $idx: $!\n");
        }
        $printqueue->enqueue($out);
    }
}
| 144 | |
## Printing-thread body: appends queued grep results to "$tmpdir/result.txt".
##
## Takes result strings from $printqueue until an undef end marker is
## dequeued. Empty results (and results that are a lone newline) are skipped.
## The result file is opened once, before the first result arrives, so it
## exists even when no batch produced any match; write buffering is left to
## the file handle. Dies (ending this thread) when the file cannot be opened
## or written.
sub printout {
    my $resultfile = "$tmpdir/result.txt";
    open(my $result_fh, '>>', $resultfile)
        or die("Could not open $resultfile for appending: $!");

    while (defined(my $result = $printqueue->dequeue())) {
        next if $result eq '' || $result eq "\n";
        print {$result_fh} $result
            or die("Could not write to $resultfile: $!");
    }

    # Buffered write errors only surface here.
    close($result_fh)
        or die("Could not write to $resultfile: $!");
}
| 167 |
