Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
comparison 2.4/lib/perl5/x86_64-linux-gnu-thread-multi/String/Approx.pm @ 18:1163c16cb3c0 draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Mon, 02 Jun 2014 07:35:53 -0400 |
parents | e3609c8714fb |
children |
comparison
equal
deleted
inserted
replaced
17:5343ef57827f | 18:1163c16cb3c0 |
---|---|
1 package String::Approx; | |
2 | |
3 require v5.8.0; | |
4 | |
5 $VERSION = '3.27'; | |
6 | |
7 use strict; | |
8 local $^W = 1; | |
9 | |
10 use Carp; | |
11 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); | |
12 | |
13 require Exporter; | |
14 require DynaLoader; | |
15 | |
16 @ISA = qw(Exporter DynaLoader); | |
17 | |
18 @EXPORT_OK = qw(amatch asubstitute aindex aslice arindex | |
19 adist adistr adistword adistrword); | |
20 | |
21 bootstrap String::Approx $VERSION; | |
22 | |
23 my $CACHE_MAX = 1000; # high water mark | |
24 my $CACHE_PURGE = 0.75; # purge this much of the least used | |
25 my $CACHE_N_PURGE; # purge this many of the least used | |
26 | |
27 sub cache_n_purge () { | |
28 $CACHE_N_PURGE = $CACHE_MAX * $CACHE_PURGE; | |
29 $CACHE_N_PURGE = 1 if $CACHE_N_PURGE < 1; | |
30 return $CACHE_N_PURGE; | |
31 } | |
32 | |
33 cache_n_purge(); | |
34 | |
35 sub cache_max (;$) { | |
36 if (@_ == 0) { | |
37 return $CACHE_MAX; | |
38 } else { | |
39 $CACHE_MAX = shift; | |
40 } | |
41 $CACHE_MAX = 0 if $CACHE_MAX < 0; | |
42 cache_n_purge(); | |
43 } | |
44 | |
45 sub cache_purge (;$) { | |
46 if (@_ == 0) { | |
47 return $CACHE_PURGE; | |
48 } else { | |
49 $CACHE_PURGE = shift; | |
50 } | |
51 if ($CACHE_PURGE < 0) { | |
52 $CACHE_PURGE = 0; | |
53 } elsif ($CACHE_PURGE > 1) { | |
54 $CACHE_PURGE = 1; | |
55 } | |
56 cache_n_purge(); | |
57 } | |
58 | |
59 my %_simple; | |
60 my %_simple_usage_count; | |
61 | |
62 sub _cf_simple { | |
63 my $P = shift; | |
64 | |
65 my @usage = | |
66 sort { $_simple_usage_count{$a} <=> $_simple_usage_count{$b} } | |
67 grep { $_ ne $P } | |
68 keys %_simple_usage_count; | |
69 | |
70 # Make room, delete the least used entries. | |
71 $#usage = $CACHE_N_PURGE - 1; | |
72 | |
73 delete @_simple_usage_count{@usage}; | |
74 delete @_simple{@usage}; | |
75 } | |
76 | |
77 sub _simple { | |
78 my $P = shift; | |
79 | |
80 my $_simple = new(__PACKAGE__, $P); | |
81 | |
82 if ($CACHE_MAX) { | |
83 $_simple{$P} = $_simple unless exists $_simple{$P}; | |
84 | |
85 $_simple_usage_count{$P}++; | |
86 | |
87 if (keys %_simple_usage_count > $CACHE_MAX) { | |
88 _cf_simple($P); | |
89 } | |
90 } | |
91 | |
92 return ( $_simple ); | |
93 } | |
94 | |
95 sub _parse_param { | |
96 use integer; | |
97 | |
98 my ($n, @param) = @_; | |
99 my %param; | |
100 | |
101 foreach (@param) { | |
102 while ($_ ne '') { | |
103 s/^\s+//; | |
104 if (s/^([IDS]\s*)?(\d+)(\s*%)?//) { | |
105 my $k = defined $3 ? (($2-1) * $n) / 100 + ($2 ? 1 : 0) : $2; | |
106 | |
107 if (defined $1) { | |
108 $param{$1} = $k; | |
109 } else { | |
110 $param{k} = $k; | |
111 } | |
112 } elsif (s/^initial_position\W+(\d+)\b//) { | |
113 $param{'initial_position'} = $1; | |
114 } elsif (s/^final_position\W+(\d+)\b//) { | |
115 $param{'final_position'} = $1; | |
116 } elsif (s/^position_range\W+(\d+)\b//) { | |
117 $param{'position_range'} = $1; | |
118 } elsif (s/^minimal_distance\b//) { | |
119 $param{'minimal_distance'} = 1; | |
120 } elsif (s/^i//) { | |
121 $param{ i } = 1; | |
122 } elsif (s/^g//) { | |
123 $param{ g } = 1; | |
124 } elsif (s/^\?//) { | |
125 $param{'?'} = 1; | |
126 } else { | |
127 warn "unknown parameter: '$_'\n"; | |
128 return; | |
129 } | |
130 } | |
131 } | |
132 | |
133 return %param; | |
134 } | |
135 | |
136 my %_param_key; | |
137 my %_parsed_param; | |
138 | |
139 my %_complex; | |
140 my %_complex_usage_count; | |
141 | |
142 sub _cf_complex { | |
143 my $P = shift; | |
144 | |
145 my @usage = | |
146 sort { $_complex_usage_count{$a} <=> | |
147 $_complex_usage_count{$b} } | |
148 grep { $_ ne $P } | |
149 keys %_complex_usage_count; | |
150 | |
151 # Make room, delete the least used entries. | |
152 $#usage = $CACHE_N_PURGE - 1; | |
153 | |
154 delete @_complex_usage_count{@usage}; | |
155 delete @_complex{@usage}; | |
156 } | |
157 | |
158 sub _complex { | |
159 my ($P, @param) = @_; | |
160 unshift @param, length $P; | |
161 my $param = "@param"; | |
162 my $_param_key; | |
163 my %param; | |
164 my $complex; | |
165 my $is_new; | |
166 | |
167 unless (exists $_param_key{$param}) { | |
168 %param = _parse_param(@param); | |
169 $_parsed_param{$param} = { %param }; | |
170 $_param_key{$param} = join(" ", %param); | |
171 } else { | |
172 %param = %{ $_parsed_param{$param} }; | |
173 } | |
174 | |
175 $_param_key = $_param_key{$param}; | |
176 | |
177 if ($CACHE_MAX) { | |
178 if (exists $_complex{$P}->{$_param_key}) { | |
179 $complex = $_complex{$P}->{$_param_key}; | |
180 } | |
181 } | |
182 | |
183 unless (defined $complex) { | |
184 if (exists $param{'k'}) { | |
185 $complex = new(__PACKAGE__, $P, $param{k}); | |
186 } else { | |
187 $complex = new(__PACKAGE__, $P); | |
188 } | |
189 $_complex{$P}->{$_param_key} = $complex if $CACHE_MAX; | |
190 $is_new = 1; | |
191 } | |
192 | |
193 if ($is_new) { | |
194 $complex->set_greedy unless exists $param{'?'}; | |
195 | |
196 $complex->set_insertions($param{'I'}) | |
197 if exists $param{'I'}; | |
198 $complex->set_deletions($param{'D'}) | |
199 if exists $param{'D'}; | |
200 $complex->set_substitutions($param{'S'}) | |
201 if exists $param{'S'}; | |
202 | |
203 $complex->set_caseignore_slice | |
204 if exists $param{'i'}; | |
205 | |
206 $complex->set_text_initial_position($param{'initial_position'}) | |
207 if exists $param{'initial_position'}; | |
208 | |
209 $complex->set_text_final_position($param{'final_position'}) | |
210 if exists $param{'final_position'}; | |
211 | |
212 $complex->set_text_position_range($param{'position_range'}) | |
213 if exists $param{'position_range'}; | |
214 | |
215 $complex->set_minimal_distance($param{'minimal_distance'}) | |
216 if exists $param{'minimal_distance'}; | |
217 } | |
218 | |
219 if ($CACHE_MAX) { | |
220 $_complex_usage_count{$P}->{$_param_key}++; | |
221 | |
222 # If our cache overfloweth. | |
223 if (scalar keys %_complex_usage_count > $CACHE_MAX) { | |
224 _cf_complex($P); | |
225 } | |
226 } | |
227 | |
228 return ( $complex, %param ); | |
229 } | |
230 | |
231 sub cache_disable { | |
232 cache_max(0); | |
233 } | |
234 | |
235 sub cache_flush_all { | |
236 my $old_purge = cache_purge(); | |
237 cache_purge(1); | |
238 _cf_simple(''); | |
239 _cf_complex(''); | |
240 cache_purge($old_purge); | |
241 } | |
242 | |
243 sub amatch { | |
244 my $P = shift; | |
245 return 1 unless length $P; | |
246 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
247 _complex($P, @{ shift(@_) }) : _simple($P))[0]; | |
248 | |
249 if (@_) { | |
250 if (wantarray) { | |
251 return grep { $a->match($_) } @_; | |
252 } else { | |
253 foreach (@_) { | |
254 return 1 if $a->match($_); | |
255 } | |
256 return 0; | |
257 } | |
258 } | |
259 if (defined $_) { | |
260 if (wantarray) { | |
261 return $a->match($_) ? $_ : undef; | |
262 } else { | |
263 return 1 if $a->match($_); | |
264 } | |
265 } | |
266 return $a->match($_) if defined $_; | |
267 | |
268 warn "amatch: \$_ is undefined: what are you matching?\n"; | |
269 return; | |
270 } | |
271 | |
272 sub _find_substitute { | |
273 my ($ri, $rs, $i, $s, $S, $rn) = @_; | |
274 | |
275 push @{ $ri }, $i; | |
276 push @{ $rs }, $s; | |
277 | |
278 my $pre = substr($_, 0, $i); | |
279 my $old = substr($_, $i, $s); | |
280 my $suf = substr($_, $i + $s); | |
281 my $new = $S; | |
282 | |
283 $new =~ s/\$\`/$pre/g; | |
284 $new =~ s/\$\&/$old/g; | |
285 $new =~ s/\$\'/$suf/g; | |
286 | |
287 push @{ $rn }, $new; | |
288 } | |
289 | |
290 sub _do_substitute { | |
291 my ($rn, $ri, $rs, $rS) = @_; | |
292 | |
293 my $d = 0; | |
294 my $n = $_; | |
295 | |
296 foreach my $i (0..$#$rn) { | |
297 substr($n, $ri->[$i] + $d, $rs->[$i]) = $rn->[$i]; | |
298 $d += length($rn->[$i]) - $rs->[$i]; | |
299 } | |
300 | |
301 push @{ $rS }, $n; | |
302 } | |
303 | |
304 sub asubstitute { | |
305 my $P = shift; | |
306 my $S = shift; | |
307 my ($a, %p) = | |
308 (@_ && ref $_[0] eq 'ARRAY') ? | |
309 _complex($P, @{ shift(@_) }) : _simple($P); | |
310 | |
311 my ($i, $s, @i, @s, @n, @S); | |
312 | |
313 if (@_) { | |
314 if (exists $p{ g }) { | |
315 foreach (@_) { | |
316 @s = @i = @n = (); | |
317 while (($i, $s) = $a->slice_next($_)) { | |
318 if (defined $i) { | |
319 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
320 } | |
321 } | |
322 _do_substitute(\@n, \@i, \@s, \@S) if @n; | |
323 } | |
324 } else { | |
325 foreach (@_) { | |
326 @s = @i = @n = (); | |
327 ($i, $s) = $a->slice($_); | |
328 if (defined $i) { | |
329 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
330 _do_substitute(\@n, \@i, \@s, \@S); | |
331 } | |
332 } | |
333 } | |
334 return @S; | |
335 } elsif (defined $_) { | |
336 if (exists $p{ g }) { | |
337 while (($i, $s) = $a->slice_next($_)) { | |
338 if (defined $i) { | |
339 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
340 } | |
341 } | |
342 _do_substitute(\@n, \@i, \@s, \@S) if @n; | |
343 } else { | |
344 ($i, $s) = $a->slice($_); | |
345 if (defined $i) { | |
346 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
347 _do_substitute(\@n, \@i, \@s, \@S); | |
348 } | |
349 } | |
350 return $_ = $n[0]; | |
351 } else { | |
352 warn "asubstitute: \$_ is undefined: what are you substituting?\n"; | |
353 return; | |
354 } | |
355 } | |
356 | |
357 sub aindex { | |
358 my $P = shift; | |
359 return 0 unless length $P; | |
360 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
361 _complex($P, @{ shift(@_) }) : _simple($P))[0]; | |
362 | |
363 $a->set_greedy; # The *first* match, thank you. | |
364 | |
365 if (@_) { | |
366 if (wantarray) { | |
367 return map { $a->index($_) } @_; | |
368 } else { | |
369 return $a->index($_[0]); | |
370 } | |
371 } | |
372 return $a->index($_) if defined $_; | |
373 | |
374 warn "aindex: \$_ is undefined: what are you indexing?\n"; | |
375 return; | |
376 } | |
377 | |
378 sub aslice { | |
379 my $P = shift; | |
380 return (0, 0) unless length $P; | |
381 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
382 _complex($P, @{ shift(@_) }) : _simple($P))[0]; | |
383 | |
384 $a->set_greedy; # The *first* match, thank you. | |
385 | |
386 if (@_) { | |
387 return map { [ $a->slice($_) ] } @_; | |
388 } | |
389 return $a->slice($_) if defined $_; | |
390 | |
391 warn "aslice: \$_ is undefined: what are you slicing?\n"; | |
392 return; | |
393 } | |
394 | |
395 sub _adist { | |
396 my $s0 = shift; | |
397 my $s1 = shift; | |
398 my ($aslice) = aslice($s0, ['minimal_distance', @_], $s1); | |
399 my ($index, $size, $distance) = @$aslice; | |
400 my ($l0, $l1) = map { length } ($s0, $s1); | |
401 return $l0 <= $l1 ? $distance : -$distance; | |
402 } | |
403 | |
404 sub adist { | |
405 my $a0 = shift; | |
406 my $a1 = shift; | |
407 if (length($a0) == 0) { | |
408 return length($a1); | |
409 } | |
410 if (length($a1) == 0) { | |
411 return length($a0); | |
412 } | |
413 my @m = ref $_[0] eq 'ARRAY' ? @{shift()} : (); | |
414 if (ref $a0 eq 'ARRAY') { | |
415 if (ref $a1 eq 'ARRAY') { | |
416 return [ map { adist($a0, $_, @m) } @{$a1} ]; | |
417 } else { | |
418 return [ map { _adist($_, $a1, @m) } @{$a0} ]; | |
419 } | |
420 } elsif (ref $a1 eq 'ARRAY') { | |
421 return [ map { _adist($a0, $_, @m) } @{$a1} ]; | |
422 } else { | |
423 if (wantarray) { | |
424 return map { _adist($a0, $_, @m) } ($a1, @_); | |
425 } else { | |
426 return _adist($a0, $a1, @m); | |
427 } | |
428 } | |
429 } | |
430 | |
431 sub adistr { | |
432 my $a0 = shift; | |
433 my $a1 = shift; | |
434 my @m = ref $_[0] eq 'ARRAY' ? shift : (); | |
435 if (ref $a0 eq 'ARRAY') { | |
436 if (ref $a1 eq 'ARRAY') { | |
437 my $l0 = length(); | |
438 return $l0 ? [ map { adist($a0, $_, @m) } | |
439 @{$a1} ] : | |
440 [ ]; | |
441 } else { | |
442 return [ map { my $l0 = length(); | |
443 $l0 ? _adist($_, $a1, @m) / $l0 : undef | |
444 } @{$a0} ]; | |
445 } | |
446 } elsif (ref $a1 eq 'ARRAY') { | |
447 my $l0 = length($a0); | |
448 return [] unless $l0; | |
449 return [ map { _adist($a0, $_, @m) / $l0 } @{$a1} ]; | |
450 } else { | |
451 my $l0 = length($a0); | |
452 if (wantarray) { | |
453 return map { $l0 ? _adist($a0, $_, @m) / $l0 : undef } ($a1, @_); | |
454 } else { | |
455 return undef unless $l0; | |
456 return _adist($a0, $a1, @m) / $l0; | |
457 } | |
458 } | |
459 } | |
460 | |
461 sub adistword { | |
462 return adist($_[0], $_[1], ['position_range=0']); | |
463 } | |
464 | |
465 sub adistrword { | |
466 return adistr($_[0], $_[1], ['position_range=0']); | |
467 } | |
468 | |
469 sub arindex { | |
470 my $P = shift; | |
471 my $l = length $P; | |
472 return 0 unless $l; | |
473 my $R = reverse $P; | |
474 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
475 _complex($R, @{ shift(@_) }) : _simple($R))[0]; | |
476 | |
477 $a->set_greedy; # The *first* match, thank you. | |
478 | |
479 if (@_) { | |
480 if (wantarray) { | |
481 return map { | |
482 my $aindex = $a->index(scalar reverse()); | |
483 $aindex == -1 ? $aindex : (length($_) - $aindex - $l); | |
484 } @_; | |
485 } else { | |
486 my $aindex = $a->index(scalar reverse $_[0]); | |
487 return $aindex == -1 ? $aindex : (length($_[0]) - $aindex - $l); | |
488 } | |
489 } | |
490 if (defined $_) { | |
491 my $aindex = $a->index(scalar reverse()); | |
492 return $aindex == -1 ? $aindex : (length($_) - $aindex - $l); | |
493 } | |
494 | |
495 warn "arindex: \$_ is undefined: what are you indexing?\n"; | |
496 return; | |
497 } | |
498 | |
499 1; | |
500 __END__ | |
501 =pod | |
502 | |
503 =head1 NAME | |
504 | |
505 String::Approx - Perl extension for approximate matching (fuzzy matching) | |
506 | |
507 =head1 SYNOPSIS | |
508 | |
509 use String::Approx 'amatch'; | |
510 | |
511 print if amatch("foobar"); | |
512 | |
513 my @matches = amatch("xyzzy", @inputs); | |
514 | |
515 my @catches = amatch("plugh", ['2'], @inputs); | |
516 | |
517 =head1 DESCRIPTION | |
518 | |
519 String::Approx lets you match and substitute strings approximately. | |
520 With this you can emulate errors: typing errorrs, speling errors, | |
521 closely related vocabularies (colour color), genetic mutations (GAG | |
522 ACT), abbreviations (McScot, MacScot). | |
523 | |
524 NOTE: String::Approx suits the task of B<string matching>, not | |
525 B<string comparison>, and it works for B<strings>, not for B<text>. | |
526 | |
527 If you want to compare strings for similarity, you probably just want | |
528 the Levenshtein edit distance (explained below), the Text::Levenshtein | |
529 and Text::LevenshteinXS modules in CPAN. See also Text::WagnerFischer | |
530 and Text::PhraseDistance. (There are functions for this in String::Approx, | |
531 e.g. adist(), but their results sometimes differ from the bare Levenshtein | |
532 et al.) | |
533 | |
534 If you want to compare things like text or source code, consisting of | |
535 B<words> or B<tokens> and B<phrases> and B<sentences>, or | |
536 B<expressions> and B<statements>, you should probably use some other | |
537 tool than String::Approx, like for example the standard UNIX diff(1) | |
538 tool, or the Algorithm::Diff module from CPAN. | |
539 | |
540 The measure of B<approximateness> is the I<Levenshtein edit distance>. | |
541 It is the total number of "edits": insertions, | |
542 | |
543 word world | |
544 | |
545 deletions, | |
546 | |
547 monkey money | |
548 | |
549 and substitutions | |
550 | |
551 sun fun | |
552 | |
553 required to transform a string to another string. For example, to | |
554 transform I<"lead"> into I<"gold">, you need three edits: | |
555 | |
556 lead gead goad gold | |
557 | |
558 The edit distance of "lead" and "gold" is therefore three, or 75%. | |
559 | |
560 B<String::Approx> uses the Levenshtein edit distance as its measure, but | |
561 String::Approx is not well-suited for comparing strings of different | |
562 length, in other words, if you want a "fuzzy eq", see above. | |
563 String::Approx is more like regular expressions or index(), it finds | |
564 substrings that are close matches.> | |
565 | |
566 =head1 MATCH | |
567 | |
568 use String::Approx 'amatch'; | |
569 | |
570 $matched = amatch("pattern") | |
571 $matched = amatch("pattern", [ modifiers ]) | |
572 | |
573 $any_matched = amatch("pattern", @inputs) | |
574 $any_matched = amatch("pattern", [ modifiers ], @inputs) | |
575 | |
576 @match = amatch("pattern") | |
577 @match = amatch("pattern", [ modifiers ]) | |
578 | |
579 @matches = amatch("pattern", @inputs) | |
580 @matches = amatch("pattern", [ modifiers ], @inputs) | |
581 | |
582 Match B<pattern> approximately. In list context return the matched | |
583 B<@inputs>. If no inputs are given, match against the B<$_>. In scalar | |
584 context return true if I<any> of the inputs match, false if none match. | |
585 | |
586 Notice that the pattern is a string. Not a regular expression. None | |
587 of the regular expression notations (^, ., *, and so on) work. They | |
588 are characters just like the others. Note-on-note: some limited form | |
589 of I<"regular expressionism"> is planned in future: for example | |
590 character classes ([abc]) and I<any-chars> (.). But that feature will | |
591 be turned on by a special I<modifier> (just a guess: "r"), so there | |
592 should be no backward compatibility problem. | |
593 | |
594 Notice also that matching is not symmetric. The inputs are matched | |
595 against the pattern, not the other way round. In other words: the | |
596 pattern can be a substring, a submatch, of an input element. An input | |
597 element is always a superstring of the pattern. | |
598 | |
599 =head2 MODIFIERS | |
600 | |
601 With the modifiers you can control the amount of approximateness and | |
602 certain other control variables. The modifiers are one or more | |
603 strings, for example B<"i">, within a string optionally separated by | |
604 whitespace. The modifiers are inside an anonymous array: the B<[ ]> | |
605 in the syntax are not notational, they really do mean B<[ ]>, for | |
606 example B<[ "i", "2" ]>. B<["2 i"]> would be identical. | |
607 | |
608 The implicit default approximateness is 10%, rounded up. In other | |
609 words: every tenth character in the pattern may be an error, an edit. | |
610 You can explicitly set the maximum approximateness by supplying a | |
611 modifier like | |
612 | |
613 number | |
614 number% | |
615 | |
616 Examples: B<"3">, B<"15%">. | |
617 | |
618 Note that C<0%> is not rounded up, it is equal to C<0>. | |
619 | |
620 Using a similar syntax you can separately control the maximum number | |
621 of insertions, deletions, and substitutions by prefixing the numbers | |
622 with I, D, or S, like this: | |
623 | |
624 Inumber | |
625 Inumber% | |
626 Dnumber | |
627 Dnumber% | |
628 Snumber | |
629 Snumber% | |
630 | |
631 Examples: B<"I2">, B<"D20%">, B<"S0">. | |
632 | |
633 You can ignore case (B<"A"> becames equal to B<"a"> and vice versa) | |
634 by adding the B<"i"> modifier. | |
635 | |
636 For example | |
637 | |
638 [ "i 25%", "S0" ] | |
639 | |
640 means I<ignore case>, I<allow every fourth character to be "an edit">, | |
641 but allow I<no substitutions>. (See L<NOTES> about disallowing | |
642 substitutions or insertions.) | |
643 | |
644 NOTE: setting C<I0 D0 S0> is not equivalent to using index(). | |
645 If you want to use index(), use index(). | |
646 | |
647 =head1 SUBSTITUTE | |
648 | |
649 use String::Approx 'asubstitute'; | |
650 | |
651 @substituted = asubstitute("pattern", "replacement") | |
652 @substituted = asubstitute("pattern", "replacement", @inputs) | |
653 @substituted = asubstitute("pattern", "replacement", [ modifiers ]) | |
654 @substituted = asubstitute("pattern", "replacement", | |
655 [ modifiers ], @inputs) | |
656 | |
657 Substitute approximate B<pattern> with B<replacement> and return as a | |
658 list <copies> of B<@inputs>, the substitutions having been made on the | |
659 elements that did match the pattern. If no inputs are given, | |
660 substitute in the B<$_>. The replacement can contain magic strings | |
661 B<$&>, B<$`>, B<$'> that stand for the matched string, the string | |
662 before it, and the string after it, respectively. All the other | |
663 arguments are as in C<amatch()>, plus one additional modifier, B<"g"> | |
664 which means substitute globally (all the matches in an element and not | |
665 just the first one, as is the default). | |
666 | |
667 See L<BAD NEWS> about the unfortunate stinginess of C<asubstitute()>. | |
668 | |
669 =head1 INDEX | |
670 | |
671 use String::Approx 'aindex'; | |
672 | |
673 $index = aindex("pattern") | |
674 @indices = aindex("pattern", @inputs) | |
675 $index = aindex("pattern", [ modifiers ]) | |
676 @indices = aindex("pattern", [ modifiers ], @inputs) | |
677 | |
678 Like C<amatch()> but returns the index/indices at which the pattern | |
679 matches approximately. In list context and if C<@inputs> are used, | |
680 returns a list of indices, one index for each input element. | |
681 If there's no approximate match, C<-1> is returned as the index. | |
682 | |
683 NOTE: if there is character repetition (e.g. "aa") either in | |
684 the pattern or in the text, the returned index might start | |
685 "too early". This is consistent with the goal of the module | |
686 of matching "as early as possible", just like regular expressions | |
687 (that there might be a "less approximate" match starting later is | |
688 of somewhat irrelevant). | |
689 | |
690 There's also backwards-scanning C<arindex()>. | |
691 | |
692 =head1 SLICE | |
693 | |
694 use String::Approx 'aslice'; | |
695 | |
696 ($index, $size) = aslice("pattern") | |
697 ([$i0, $s0], ...) = aslice("pattern", @inputs) | |
698 ($index, $size) = aslice("pattern", [ modifiers ]) | |
699 ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
700 | |
701 Like C<aindex()> but returns also the size (length) of the match. | |
702 If the match fails, returns an empty list (when matching against C<$_>) | |
703 or an empty anonymous list corresponding to the particular input. | |
704 | |
705 NOTE: size of the match will very probably be something you did not | |
706 expect (such as longer than the pattern, or a negative number). This | |
707 may or may not be fixed in future releases. Also the beginning of the | |
708 match may vary from the expected as with aindex(), see above. | |
709 | |
710 If the modifier | |
711 | |
712 "minimal_distance" | |
713 | |
714 is used, the minimal possible edit distance is returned as the | |
715 third element: | |
716 | |
717 ($index, $size, $distance) = aslice("pattern", [ modifiers ]) | |
718 ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
719 | |
720 =head1 DISTANCE | |
721 | |
722 use String::Approx 'adist'; | |
723 | |
724 $dist = adist("pattern", $input); | |
725 @dist = adist("pattern", @input); | |
726 | |
727 Return the I<edit distance> or distances between the pattern and the | |
728 input or inputs. Zero edit distance means exact match. (Remember | |
729 that the match can 'float' in the inputs, the match is a substring | |
730 match.) If the pattern is longer than the input or inputs, the | |
731 returned distance or distances is or are negative. | |
732 | |
733 use String::Approx 'adistr'; | |
734 | |
735 $dist = adistr("pattern", $input); | |
736 @dist = adistr("pattern", @inputs); | |
737 | |
738 Return the B<relative> I<edit distance> or distances between the | |
739 pattern and the input or inputs. Zero relative edit distance means | |
740 exact match, one means completely different. (Remember that the | |
741 match can 'float' in the inputs, the match is a substring match.) If | |
742 the pattern is longer than the input or inputs, the returned distance | |
743 or distances is or are negative. | |
744 | |
745 You can use adist() or adistr() to sort the inputs according to their | |
746 approximateness: | |
747 | |
748 my %d; | |
749 @d{@inputs} = map { abs } adistr("pattern", @inputs); | |
750 my @d = sort { $d{$a} <=> $d{$b} } @inputs; | |
751 | |
752 Now C<@d> contains the inputs, the most like C<"pattern"> first. | |
753 | |
754 =head1 CONTROLLING THE CACHE | |
755 | |
756 C<String::Approx> maintains a LU (least-used) cache that holds the | |
757 'matching engines' for each instance of a I<pattern+modifiers>. The | |
758 cache is intended to help the case where you match a small set of | |
759 patterns against a large set of string. However, the more engines you | |
760 cache the more you eat memory. If you have a lot of different | |
761 patterns or if you have a lot of memory to burn, you may want to | |
762 control the cache yourself. For example, allowing a larger cache | |
763 consumes more memory but probably runs a little bit faster since the | |
764 cache fills (and needs flushing) less often. | |
765 | |
766 The cache has two parameters: I<max> and I<purge>. The first one | |
767 is the maximum size of the cache and the second one is the cache | |
768 flushing ratio: when the number of cache entries exceeds I<max>, | |
769 I<max> times I<purge> cache entries are flushed. The default | |
770 values are 1000 and 0.75, respectively, which means that when | |
771 the 1001st entry would be cached, 750 least used entries will | |
772 be removed from the cache. To access the parameters you can | |
773 use the calls | |
774 | |
775 $now_max = String::Approx::cache_max(); | |
776 String::Approx::cache_max($new_max); | |
777 | |
778 $now_purge = String::Approx::cache_purge(); | |
779 String::Approx::cache_purge($new_purge); | |
780 | |
781 $limit = String::Approx::cache_n_purge(); | |
782 | |
783 To be honest, there are actually B<two> caches: the first one is used | |
784 far the patterns with no modifiers, the second one for the patterns | |
785 with pattern modifiers. Using the standard parameters you will | |
786 therefore actually cache up to 2000 entries. The above calls control | |
787 both caches for the same price. | |
788 | |
789 To disable caching completely use | |
790 | |
791 String::Approx::cache_disable(); | |
792 | |
793 Note that this doesn't flush any possibly existing cache entries, | |
794 to do that use | |
795 | |
796 String::Approx::cache_flush_all(); | |
797 | |
798 =head1 NOTES | |
799 | |
800 Because matching is by I<substrings>, not by whole strings, insertions | |
801 and substitutions produce often very similar results: "abcde" matches | |
802 "axbcde" either by insertion B<or> substitution of "x". | |
803 | |
804 The maximum edit distance is also the maximum number of edits. | |
805 That is, the B<"I2"> in | |
806 | |
807 amatch("abcd", ["I2"]) | |
808 | |
809 is useless because the maximum edit distance is (implicitly) 1. | |
810 You may have meant to say | |
811 | |
812 amatch("abcd", ["2D1S1"]) | |
813 | |
814 or something like that. | |
815 | |
816 If you want to simulate transposes | |
817 | |
818 feet fete | |
819 | |
820 you need to allow at least edit distance of two because in terms of | |
821 our edit primitives a transpose is first one deletion and then one | |
822 insertion. | |
823 | |
824 =head2 TEXT POSITION | |
825 | |
826 The starting and ending positions of matching, substituting, indexing, or | |
827 slicing can be changed from the beginning and end of the input(s) to | |
828 some other positions by using either or both of the modifiers | |
829 | |
830 "initial_position=24" | |
831 "final_position=42" | |
832 | |
833 or the both the modifiers | |
834 | |
835 "initial_position=24" | |
836 "position_range=10" | |
837 | |
838 By setting the B<"position_range"> to be zero you can limit | |
839 (anchor) the operation to happen only once (if a match is possible) | |
840 at the position. | |
841 | |
842 =head1 VERSION | |
843 | |
844 Major release 3. | |
845 | |
846 =head1 CHANGES FROM VERSION 2 | |
847 | |
848 =head2 GOOD NEWS | |
849 | |
850 =over 4 | |
851 | |
852 =item The version 3 is 2-3 times faster than version 2 | |
853 | |
854 =item No pattern length limitation | |
855 | |
856 The algorithm is independent on the pattern length: its time | |
857 complexity is I<O(kn)>, where I<k> is the number of edits and I<n> the | |
858 length of the text (input). The preprocessing of the pattern will of | |
859 course take some I<O(m)> (I<m> being the pattern length) time, but | |
860 C<amatch()> and C<asubstitute()> cache the result of this | |
861 preprocessing so that it is done only once per pattern. | |
862 | |
863 =back | |
864 | |
865 =head2 BAD NEWS | |
866 | |
867 =over 4 | |
868 | |
869 =item You do need a C compiler to install the module | |
870 | |
871 Perl's regular expressions are no more used; instead a faster and more | |
872 scalable algorithm written in C is used. | |
873 | |
874 =item C<asubstitute()> is now always stingy | |
875 | |
876 The string matched and substituted is now always stingy, as short | |
877 as possible. It used to be as long as possible. This is an unfortunate | |
878 change stemming from switching the matching algorithm. Example: with | |
879 edit distance of two and substituting for B<"word"> from B<"cork"> and | |
880 B<"wool"> previously did match B<"cork"> and B<"wool">. Now it does | |
881 match B<"or"> and B<"wo">. As little as possible, or, in other words, | |
882 with as much approximateness, as many edits, as possible. Because | |
883 there is no I<need> to match the B<"c"> of B<"cork">, it is not matched. | |
884 | |
885 =item no more C<aregex()> because regular expressions are no more used | |
886 | |
887 =item no more C<compat1> for String::Approx version 1 compatibility | |
888 | |
889 =back | |
890 | |
891 =head1 ACKNOWLEDGEMENTS | |
892 | |
893 The following people have provided valuable test cases, documentation | |
894 clarifications, and other feedback: | |
895 | |
896 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz, | |
897 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda, | |
898 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner, | |
899 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton, | |
900 'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley, | |
901 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman, | |
902 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park, | |
903 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic, | |
904 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin, | |
905 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith, | |
906 Frank Tobin, Greg Ward, Rich Williams, Rick Wise. | |
907 | |
908 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra | |
909 Gopal in the Department of Computer Science, University of Arizona. | |
910 | |
911 =head1 AUTHOR | |
912 | |
913 Jarkko Hietaniemi <jhi@iki.fi> | |
914 | |
915 =head1 COPYRIGHT AND LICENSE | |
916 | |
917 Copyright 2001-2013 by Jarkko Hietaniemi | |
918 | |
919 This library is free software; you can redistribute it and/or modify | |
920 under either the terms of the Artistic License 2.0, or the GNU Library | |
921 General Public License, Version 2. See the files Artistic and LGPL | |
922 for more details. | |
923 | |
924 Furthermore: no warranties or obligations of any kind are given, and | |
925 the separate file F<COPYRIGHT> must be included intact in all copies | |
926 and derived materials. | |
927 | |
928 =cut |