Mercurial > repos > geert-vandeweyer > advanced_grep_from_file
comparison GrepFile.pl @ 1:dba6ffec8e2e draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:37:30 -0500 |
parents | |
children | 606e24c6fda0 |
comparison
equal
deleted
inserted
replaced
0:fd6b71c81011 | 1:dba6ffec8e2e |
---|---|
#!/usr/bin/perl

# GrepFile.pl - grep an input file for one pattern, or for every pattern
# listed in a pattern file.
#
# In "single" mode grep is run once and its output becomes the result.
# In "file" mode the unique patterns are written to batch files of 100
# patterns each; a pool of worker threads runs "grep -f" on each batch and a
# separate printer thread appends the worker output to the result file.
#
# NOTE: strict/warnings are intentionally not enabled at file level: the
# thread subs further down (grep, printout) use package variables such as
# $grepqueue and $printqueue, and file-level pragmas would apply to them too.

# load modules
use Getopt::Std;
use threads;
use Thread::Queue;
use threads::shared;

my $start_time = time;

# opts
# i : infile
# f : pattern (type "single") or pattern file (type "file")
# o : output file
# t : type (file/single)
# I : Insensitive to case
# P : Perl-Based Grep (boolean)
# A : number of extra lines to fetch
my %opts;
getopts('i:f:o:t:PIA:', \%opts);

## variables for threads.
my $infile :shared;
my $outfile :shared;
my $args :shared;
my $rand :shared;
my $tmpdir :shared;

## queues are package variables because the thread subs access them by name.
our ($grepqueue, $printqueue);

## nr of grep threads (if created), one extra is created for printing.
my $nrgrep = 6;

## validate all options before anything is created on disk.
if (!defined($opts{'i'})) { die('Input file is mandatory'); }
$infile = $opts{'i'};
if (!defined($opts{'o'})) { die('Output file is mandatory'); }
$outfile = $opts{'o'};
if (!defined($opts{'t'})) { die('Pattern source is mandatory (file/single)'); }
my $type = $opts{'t'};
if ($type ne 'file' && $type ne 'single') {
    die('only "file" and "single" are supported as value of the -t flag');
}
if (!defined($opts{'f'})) { die('Pattern (or pattern file) is mandatory'); }

## grep options: kept as a list for direct execution and as a string
## (" -i -P -A n") for the command line built by the worker threads.
my @grep_opts;
if (defined($opts{'I'})) {
    push(@grep_opts, '-i');
}
if (defined($opts{'P'})) {
    push(@grep_opts, '-P');
}
if (defined($opts{'A'})) {
    if ($opts{'A'} =~ m/^\d+$/) {
        push(@grep_opts, '-A', $opts{'A'});
    }
    else {
        die("Invalid amount of lines to fetch provided (must be integer)");
    }
}
$args = join(' ', '', @grep_opts);

## create tmp location. mkdir itself is the existence test, so two runs
## picking the same random number cannot end up sharing a directory.
for my $attempt (1 .. 1000) {
    $rand = int(rand(10000));
    my $candidate = "/tmp/GrepFile.$rand";
    if (mkdir($candidate, 0700)) {
        $tmpdir = $candidate;
        last;
    }
    die("Could not create temporary directory $candidate: $!") unless $!{EEXIST};
}
if (!defined($tmpdir)) {
    die('Could not find a free temporary directory name under /tmp');
}

## fatal error helper for everything that happens once $tmpdir exists:
## remove the temporary directory, then die with the given message.
my $fail = sub {
    my ($message) = @_;
    system('rm', '-Rf', $tmpdir);
    die($message);
};

my $resultfile = "$tmpdir/result.txt";

if ($type eq 'single') {
    ## single pattern => direct processing, no threads.
    ## grep is started without a shell, so the pattern is passed verbatim.
    my $patt = $opts{'f'};
    open(my $result_fh, '>', $resultfile)
        or $fail->("Could not create $resultfile: $!");
    open(my $grep_fh, '-|', 'grep', @grep_opts, '-e', $patt, '--', $infile)
        or $fail->("Could not start grep: $!");
    while (my $line = <$grep_fh>) {
        print {$result_fh} $line;
    }
    close($grep_fh);
    ## exit status 1 only means "no lines selected"; above that is an error.
    my $grep_status = $? >> 8;
    close($result_fh) or $fail->("Could not write $resultfile: $!");
    if ($grep_status > 1) {
        $fail->("grep failed with exit status $grep_status");
    }
}
else {
    my $pattfile = $opts{'f'};

    ## copy infile to local system for speed
    system('cp', '--', $infile, "$tmpdir/infile") == 0
        or $fail->("Could not copy input file '$infile' to $tmpdir");
    mkdir("$tmpdir/pattfiles")
        or $fail->("Could not create $tmpdir/pattfiles: $!");

    ## make sure the result file exists even when no pattern matches;
    ## the printer thread only ever appends to it.
    open(my $result_fh, '>', $resultfile)
        or $fail->("Could not create $resultfile: $!");
    close($result_fh);

    ## write the batch files before any thread exists, so that every failure
    ## below can still stop the script cleanly and the pattern list is gone
    ## from memory by the time the threads are created.
    my $nr_batches = 0;
    {
        open(my $pat_fh, '<', $pattfile)
            or $fail->("Could not read pattern file '$pattfile': $!");
        ## make sure patterns are unique
        my %pats;
        while (my $line = <$pat_fh>) {
            chomp($line);
            ## an empty pattern would make "grep -f" select every line
            next if $line eq '';
            $pats{$line} = 1;
        }
        close($pat_fh);

        ## run in batches of 100 patterns.
        my @patterns = keys(%pats);
        while (my @subset = splice(@patterns, 0, 100)) {
            $nr_batches++;
            my $batchfile = "$tmpdir/pattfiles/$nr_batches";
            open(my $batch_fh, '>', $batchfile)
                or $fail->("Could not create $batchfile: $!");
            print {$batch_fh} join("\n", @subset), "\n";
            close($batch_fh) or $fail->("Could not write $batchfile: $!");
        }
    }

    ## start the grep workers and the printer, then hand out the batches.
    $grepqueue  = Thread::Queue->new();
    $printqueue = Thread::Queue->new();
    my @workers;
    for my $nr (1 .. $nrgrep) {
        my $worker = threads->create('grep');
        push(@workers, $worker);
    }
    my $printer = threads->create('printout');

    for my $batch (1 .. $nr_batches) {
        $grepqueue->enqueue($batch);
    }
    ## one undef per worker tells that worker to stop.
    for my $nr (1 .. $nrgrep) {
        $grepqueue->enqueue(undef);
    }
    for my $worker (@workers) {
        $worker->join();
    }
    ## all workers are done, so nothing more will reach the printer.
    $printqueue->enqueue(undef);
    $printer->join();
}

system('cp', '--', $resultfile, $outfile) == 0
    or $fail->("Could not write output file '$outfile'");

system('rm', '-Rf', $tmpdir);

##################
# PRINT RUN-TIME #
##################
my $elapsed = time - $start_time;
printf("\n\nRunning time:%02d:%02d:%02d\n",int($elapsed/3600),int(($elapsed % 3600)/60),int($elapsed % 60));

# Thread entry point for the grep workers (started by name through
# threads->create('grep')).
#
# Takes batch numbers from $grepqueue until an undef arrives. For each batch
# it greps the local copy of the input file with the patterns stored in
# <tmpdir>/pattfiles/<batch number>, removes lines that consist solely of
# "--" (presumably the group separator grep prints when -A is used), and
# puts the remaining output as one string on $printqueue.
sub grep {
    # take private copies of the shared settings once, before the loop
    my ($largs, $ltmp) = ($args, $tmpdir);
    while (1) {
        my $idx = $grepqueue->dequeue();
        # undef is the stop marker
        last unless defined($idx);
        my $command = "grep $largs -f '$ltmp/pattfiles/$idx' '$ltmp/infile' | grep -v '^--\$'";
        my $matches = qx{$command};
        $printqueue->enqueue($matches);
    }
}

# Thread entry point for the printer (started by name through
# threads->create('printout')).
#
# Takes result strings from $printqueue until an undef arrives and appends
# them to <tmpdir>/result.txt. Results that are empty or a lone newline are
# skipped. The rest is buffered and written once more than 50 results have
# been collected, and once more when the queue is finished.
#
# Dies (ending this thread with a message on stderr) when the result file
# cannot be opened or written, instead of silently dropping the output.
sub printout {
    my $resultfile = "$tmpdir/result.txt";
    my $buffer  = '';
    my $pending = 0;

    # append the buffered results to the result file and reset the buffer
    my $flush = sub {
        return if $buffer eq '';
        open(my $out_fh, '>>', $resultfile)
            or die("Could not open $resultfile for appending: $!");
        print {$out_fh} $buffer
            or die("Could not write to $resultfile: $!");
        close($out_fh)
            or die("Could not close $resultfile: $!");
        $buffer  = '';
        $pending = 0;
    };

    while (defined(my $result = $printqueue->dequeue())) {
        next if $result eq '' || $result eq "\n";
        $buffer .= $result;
        $pending++;
        $flush->() if $pending > 50;
    }
    # write whatever is left after the stop marker
    $flush->();
}
