13
|
1 package String::Approx;
|
|
2
|
|
3 require v5.8.0;
|
|
4
|
|
5 $VERSION = '3.27';
|
|
6
|
|
7 use strict;
|
|
8 local $^W = 1;
|
|
9
|
|
10 use Carp;
|
|
11 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
|
|
12
|
|
13 require Exporter;
|
|
14 require DynaLoader;
|
|
15
|
|
16 @ISA = qw(Exporter DynaLoader);
|
|
17
|
|
18 @EXPORT_OK = qw(amatch asubstitute aindex aslice arindex
|
|
19 adist adistr adistword adistrword);
|
|
20
|
|
21 bootstrap String::Approx $VERSION;
|
|
22
|
|
23 my $CACHE_MAX = 1000; # high water mark
|
|
24 my $CACHE_PURGE = 0.75; # purge this much of the least used
|
|
25 my $CACHE_N_PURGE; # purge this many of the least used
|
|
26
|
|
27 sub cache_n_purge () {
|
|
28 $CACHE_N_PURGE = $CACHE_MAX * $CACHE_PURGE;
|
|
29 $CACHE_N_PURGE = 1 if $CACHE_N_PURGE < 1;
|
|
30 return $CACHE_N_PURGE;
|
|
31 }
|
|
32
|
|
33 cache_n_purge();
|
|
34
|
|
35 sub cache_max (;$) {
|
|
36 if (@_ == 0) {
|
|
37 return $CACHE_MAX;
|
|
38 } else {
|
|
39 $CACHE_MAX = shift;
|
|
40 }
|
|
41 $CACHE_MAX = 0 if $CACHE_MAX < 0;
|
|
42 cache_n_purge();
|
|
43 }
|
|
44
|
|
45 sub cache_purge (;$) {
|
|
46 if (@_ == 0) {
|
|
47 return $CACHE_PURGE;
|
|
48 } else {
|
|
49 $CACHE_PURGE = shift;
|
|
50 }
|
|
51 if ($CACHE_PURGE < 0) {
|
|
52 $CACHE_PURGE = 0;
|
|
53 } elsif ($CACHE_PURGE > 1) {
|
|
54 $CACHE_PURGE = 1;
|
|
55 }
|
|
56 cache_n_purge();
|
|
57 }
|
|
58
|
|
59 my %_simple;
|
|
60 my %_simple_usage_count;
|
|
61
|
|
62 sub _cf_simple {
|
|
63 my $P = shift;
|
|
64
|
|
65 my @usage =
|
|
66 sort { $_simple_usage_count{$a} <=> $_simple_usage_count{$b} }
|
|
67 grep { $_ ne $P }
|
|
68 keys %_simple_usage_count;
|
|
69
|
|
70 # Make room, delete the least used entries.
|
|
71 $#usage = $CACHE_N_PURGE - 1;
|
|
72
|
|
73 delete @_simple_usage_count{@usage};
|
|
74 delete @_simple{@usage};
|
|
75 }
|
|
76
|
|
77 sub _simple {
|
|
78 my $P = shift;
|
|
79
|
|
80 my $_simple = new(__PACKAGE__, $P);
|
|
81
|
|
82 if ($CACHE_MAX) {
|
|
83 $_simple{$P} = $_simple unless exists $_simple{$P};
|
|
84
|
|
85 $_simple_usage_count{$P}++;
|
|
86
|
|
87 if (keys %_simple_usage_count > $CACHE_MAX) {
|
|
88 _cf_simple($P);
|
|
89 }
|
|
90 }
|
|
91
|
|
92 return ( $_simple );
|
|
93 }
|
|
94
|
|
95 sub _parse_param {
|
|
96 use integer;
|
|
97
|
|
98 my ($n, @param) = @_;
|
|
99 my %param;
|
|
100
|
|
101 foreach (@param) {
|
|
102 while ($_ ne '') {
|
|
103 s/^\s+//;
|
|
104 if (s/^([IDS]\s*)?(\d+)(\s*%)?//) {
|
|
105 my $k = defined $3 ? (($2-1) * $n) / 100 + ($2 ? 1 : 0) : $2;
|
|
106
|
|
107 if (defined $1) {
|
|
108 $param{$1} = $k;
|
|
109 } else {
|
|
110 $param{k} = $k;
|
|
111 }
|
|
112 } elsif (s/^initial_position\W+(\d+)\b//) {
|
|
113 $param{'initial_position'} = $1;
|
|
114 } elsif (s/^final_position\W+(\d+)\b//) {
|
|
115 $param{'final_position'} = $1;
|
|
116 } elsif (s/^position_range\W+(\d+)\b//) {
|
|
117 $param{'position_range'} = $1;
|
|
118 } elsif (s/^minimal_distance\b//) {
|
|
119 $param{'minimal_distance'} = 1;
|
|
120 } elsif (s/^i//) {
|
|
121 $param{ i } = 1;
|
|
122 } elsif (s/^g//) {
|
|
123 $param{ g } = 1;
|
|
124 } elsif (s/^\?//) {
|
|
125 $param{'?'} = 1;
|
|
126 } else {
|
|
127 warn "unknown parameter: '$_'\n";
|
|
128 return;
|
|
129 }
|
|
130 }
|
|
131 }
|
|
132
|
|
133 return %param;
|
|
134 }
|
|
135
|
|
136 my %_param_key;
|
|
137 my %_parsed_param;
|
|
138
|
|
139 my %_complex;
|
|
140 my %_complex_usage_count;
|
|
141
|
|
142 sub _cf_complex {
|
|
143 my $P = shift;
|
|
144
|
|
145 my @usage =
|
|
146 sort { $_complex_usage_count{$a} <=>
|
|
147 $_complex_usage_count{$b} }
|
|
148 grep { $_ ne $P }
|
|
149 keys %_complex_usage_count;
|
|
150
|
|
151 # Make room, delete the least used entries.
|
|
152 $#usage = $CACHE_N_PURGE - 1;
|
|
153
|
|
154 delete @_complex_usage_count{@usage};
|
|
155 delete @_complex{@usage};
|
|
156 }
|
|
157
|
|
158 sub _complex {
|
|
159 my ($P, @param) = @_;
|
|
160 unshift @param, length $P;
|
|
161 my $param = "@param";
|
|
162 my $_param_key;
|
|
163 my %param;
|
|
164 my $complex;
|
|
165 my $is_new;
|
|
166
|
|
167 unless (exists $_param_key{$param}) {
|
|
168 %param = _parse_param(@param);
|
|
169 $_parsed_param{$param} = { %param };
|
|
170 $_param_key{$param} = join(" ", %param);
|
|
171 } else {
|
|
172 %param = %{ $_parsed_param{$param} };
|
|
173 }
|
|
174
|
|
175 $_param_key = $_param_key{$param};
|
|
176
|
|
177 if ($CACHE_MAX) {
|
|
178 if (exists $_complex{$P}->{$_param_key}) {
|
|
179 $complex = $_complex{$P}->{$_param_key};
|
|
180 }
|
|
181 }
|
|
182
|
|
183 unless (defined $complex) {
|
|
184 if (exists $param{'k'}) {
|
|
185 $complex = new(__PACKAGE__, $P, $param{k});
|
|
186 } else {
|
|
187 $complex = new(__PACKAGE__, $P);
|
|
188 }
|
|
189 $_complex{$P}->{$_param_key} = $complex if $CACHE_MAX;
|
|
190 $is_new = 1;
|
|
191 }
|
|
192
|
|
193 if ($is_new) {
|
|
194 $complex->set_greedy unless exists $param{'?'};
|
|
195
|
|
196 $complex->set_insertions($param{'I'})
|
|
197 if exists $param{'I'};
|
|
198 $complex->set_deletions($param{'D'})
|
|
199 if exists $param{'D'};
|
|
200 $complex->set_substitutions($param{'S'})
|
|
201 if exists $param{'S'};
|
|
202
|
|
203 $complex->set_caseignore_slice
|
|
204 if exists $param{'i'};
|
|
205
|
|
206 $complex->set_text_initial_position($param{'initial_position'})
|
|
207 if exists $param{'initial_position'};
|
|
208
|
|
209 $complex->set_text_final_position($param{'final_position'})
|
|
210 if exists $param{'final_position'};
|
|
211
|
|
212 $complex->set_text_position_range($param{'position_range'})
|
|
213 if exists $param{'position_range'};
|
|
214
|
|
215 $complex->set_minimal_distance($param{'minimal_distance'})
|
|
216 if exists $param{'minimal_distance'};
|
|
217 }
|
|
218
|
|
219 if ($CACHE_MAX) {
|
|
220 $_complex_usage_count{$P}->{$_param_key}++;
|
|
221
|
|
222 # If our cache overfloweth.
|
|
223 if (scalar keys %_complex_usage_count > $CACHE_MAX) {
|
|
224 _cf_complex($P);
|
|
225 }
|
|
226 }
|
|
227
|
|
228 return ( $complex, %param );
|
|
229 }
|
|
230
|
|
231 sub cache_disable {
|
|
232 cache_max(0);
|
|
233 }
|
|
234
|
|
235 sub cache_flush_all {
|
|
236 my $old_purge = cache_purge();
|
|
237 cache_purge(1);
|
|
238 _cf_simple('');
|
|
239 _cf_complex('');
|
|
240 cache_purge($old_purge);
|
|
241 }
|
|
242
|
|
243 sub amatch {
|
|
244 my $P = shift;
|
|
245 return 1 unless length $P;
|
|
246 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
|
|
247 _complex($P, @{ shift(@_) }) : _simple($P))[0];
|
|
248
|
|
249 if (@_) {
|
|
250 if (wantarray) {
|
|
251 return grep { $a->match($_) } @_;
|
|
252 } else {
|
|
253 foreach (@_) {
|
|
254 return 1 if $a->match($_);
|
|
255 }
|
|
256 return 0;
|
|
257 }
|
|
258 }
|
|
259 if (defined $_) {
|
|
260 if (wantarray) {
|
|
261 return $a->match($_) ? $_ : undef;
|
|
262 } else {
|
|
263 return 1 if $a->match($_);
|
|
264 }
|
|
265 }
|
|
266 return $a->match($_) if defined $_;
|
|
267
|
|
268 warn "amatch: \$_ is undefined: what are you matching?\n";
|
|
269 return;
|
|
270 }
|
|
271
|
|
272 sub _find_substitute {
|
|
273 my ($ri, $rs, $i, $s, $S, $rn) = @_;
|
|
274
|
|
275 push @{ $ri }, $i;
|
|
276 push @{ $rs }, $s;
|
|
277
|
|
278 my $pre = substr($_, 0, $i);
|
|
279 my $old = substr($_, $i, $s);
|
|
280 my $suf = substr($_, $i + $s);
|
|
281 my $new = $S;
|
|
282
|
|
283 $new =~ s/\$\`/$pre/g;
|
|
284 $new =~ s/\$\&/$old/g;
|
|
285 $new =~ s/\$\'/$suf/g;
|
|
286
|
|
287 push @{ $rn }, $new;
|
|
288 }
|
|
289
|
|
290 sub _do_substitute {
|
|
291 my ($rn, $ri, $rs, $rS) = @_;
|
|
292
|
|
293 my $d = 0;
|
|
294 my $n = $_;
|
|
295
|
|
296 foreach my $i (0..$#$rn) {
|
|
297 substr($n, $ri->[$i] + $d, $rs->[$i]) = $rn->[$i];
|
|
298 $d += length($rn->[$i]) - $rs->[$i];
|
|
299 }
|
|
300
|
|
301 push @{ $rS }, $n;
|
|
302 }
|
|
303
|
|
304 sub asubstitute {
|
|
305 my $P = shift;
|
|
306 my $S = shift;
|
|
307 my ($a, %p) =
|
|
308 (@_ && ref $_[0] eq 'ARRAY') ?
|
|
309 _complex($P, @{ shift(@_) }) : _simple($P);
|
|
310
|
|
311 my ($i, $s, @i, @s, @n, @S);
|
|
312
|
|
313 if (@_) {
|
|
314 if (exists $p{ g }) {
|
|
315 foreach (@_) {
|
|
316 @s = @i = @n = ();
|
|
317 while (($i, $s) = $a->slice_next($_)) {
|
|
318 if (defined $i) {
|
|
319 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
|
|
320 }
|
|
321 }
|
|
322 _do_substitute(\@n, \@i, \@s, \@S) if @n;
|
|
323 }
|
|
324 } else {
|
|
325 foreach (@_) {
|
|
326 @s = @i = @n = ();
|
|
327 ($i, $s) = $a->slice($_);
|
|
328 if (defined $i) {
|
|
329 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
|
|
330 _do_substitute(\@n, \@i, \@s, \@S);
|
|
331 }
|
|
332 }
|
|
333 }
|
|
334 return @S;
|
|
335 } elsif (defined $_) {
|
|
336 if (exists $p{ g }) {
|
|
337 while (($i, $s) = $a->slice_next($_)) {
|
|
338 if (defined $i) {
|
|
339 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
|
|
340 }
|
|
341 }
|
|
342 _do_substitute(\@n, \@i, \@s, \@S) if @n;
|
|
343 } else {
|
|
344 ($i, $s) = $a->slice($_);
|
|
345 if (defined $i) {
|
|
346 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
|
|
347 _do_substitute(\@n, \@i, \@s, \@S);
|
|
348 }
|
|
349 }
|
|
350 return $_ = $n[0];
|
|
351 } else {
|
|
352 warn "asubstitute: \$_ is undefined: what are you substituting?\n";
|
|
353 return;
|
|
354 }
|
|
355 }
|
|
356
|
|
357 sub aindex {
|
|
358 my $P = shift;
|
|
359 return 0 unless length $P;
|
|
360 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
|
|
361 _complex($P, @{ shift(@_) }) : _simple($P))[0];
|
|
362
|
|
363 $a->set_greedy; # The *first* match, thank you.
|
|
364
|
|
365 if (@_) {
|
|
366 if (wantarray) {
|
|
367 return map { $a->index($_) } @_;
|
|
368 } else {
|
|
369 return $a->index($_[0]);
|
|
370 }
|
|
371 }
|
|
372 return $a->index($_) if defined $_;
|
|
373
|
|
374 warn "aindex: \$_ is undefined: what are you indexing?\n";
|
|
375 return;
|
|
376 }
|
|
377
|
|
378 sub aslice {
|
|
379 my $P = shift;
|
|
380 return (0, 0) unless length $P;
|
|
381 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
|
|
382 _complex($P, @{ shift(@_) }) : _simple($P))[0];
|
|
383
|
|
384 $a->set_greedy; # The *first* match, thank you.
|
|
385
|
|
386 if (@_) {
|
|
387 return map { [ $a->slice($_) ] } @_;
|
|
388 }
|
|
389 return $a->slice($_) if defined $_;
|
|
390
|
|
391 warn "aslice: \$_ is undefined: what are you slicing?\n";
|
|
392 return;
|
|
393 }
|
|
394
|
|
395 sub _adist {
|
|
396 my $s0 = shift;
|
|
397 my $s1 = shift;
|
|
398 my ($aslice) = aslice($s0, ['minimal_distance', @_], $s1);
|
|
399 my ($index, $size, $distance) = @$aslice;
|
|
400 my ($l0, $l1) = map { length } ($s0, $s1);
|
|
401 return $l0 <= $l1 ? $distance : -$distance;
|
|
402 }
|
|
403
|
|
404 sub adist {
|
|
405 my $a0 = shift;
|
|
406 my $a1 = shift;
|
|
407 if (length($a0) == 0) {
|
|
408 return length($a1);
|
|
409 }
|
|
410 if (length($a1) == 0) {
|
|
411 return length($a0);
|
|
412 }
|
|
413 my @m = ref $_[0] eq 'ARRAY' ? @{shift()} : ();
|
|
414 if (ref $a0 eq 'ARRAY') {
|
|
415 if (ref $a1 eq 'ARRAY') {
|
|
416 return [ map { adist($a0, $_, @m) } @{$a1} ];
|
|
417 } else {
|
|
418 return [ map { _adist($_, $a1, @m) } @{$a0} ];
|
|
419 }
|
|
420 } elsif (ref $a1 eq 'ARRAY') {
|
|
421 return [ map { _adist($a0, $_, @m) } @{$a1} ];
|
|
422 } else {
|
|
423 if (wantarray) {
|
|
424 return map { _adist($a0, $_, @m) } ($a1, @_);
|
|
425 } else {
|
|
426 return _adist($a0, $a1, @m);
|
|
427 }
|
|
428 }
|
|
429 }
|
|
430
|
|
431 sub adistr {
|
|
432 my $a0 = shift;
|
|
433 my $a1 = shift;
|
|
434 my @m = ref $_[0] eq 'ARRAY' ? shift : ();
|
|
435 if (ref $a0 eq 'ARRAY') {
|
|
436 if (ref $a1 eq 'ARRAY') {
|
|
437 my $l0 = length();
|
|
438 return $l0 ? [ map { adist($a0, $_, @m) }
|
|
439 @{$a1} ] :
|
|
440 [ ];
|
|
441 } else {
|
|
442 return [ map { my $l0 = length();
|
|
443 $l0 ? _adist($_, $a1, @m) / $l0 : undef
|
|
444 } @{$a0} ];
|
|
445 }
|
|
446 } elsif (ref $a1 eq 'ARRAY') {
|
|
447 my $l0 = length($a0);
|
|
448 return [] unless $l0;
|
|
449 return [ map { _adist($a0, $_, @m) / $l0 } @{$a1} ];
|
|
450 } else {
|
|
451 my $l0 = length($a0);
|
|
452 if (wantarray) {
|
|
453 return map { $l0 ? _adist($a0, $_, @m) / $l0 : undef } ($a1, @_);
|
|
454 } else {
|
|
455 return undef unless $l0;
|
|
456 return _adist($a0, $a1, @m) / $l0;
|
|
457 }
|
|
458 }
|
|
459 }
|
|
460
|
|
461 sub adistword {
|
|
462 return adist($_[0], $_[1], ['position_range=0']);
|
|
463 }
|
|
464
|
|
465 sub adistrword {
|
|
466 return adistr($_[0], $_[1], ['position_range=0']);
|
|
467 }
|
|
468
|
|
469 sub arindex {
|
|
470 my $P = shift;
|
|
471 my $l = length $P;
|
|
472 return 0 unless $l;
|
|
473 my $R = reverse $P;
|
|
474 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
|
|
475 _complex($R, @{ shift(@_) }) : _simple($R))[0];
|
|
476
|
|
477 $a->set_greedy; # The *first* match, thank you.
|
|
478
|
|
479 if (@_) {
|
|
480 if (wantarray) {
|
|
481 return map {
|
|
482 my $aindex = $a->index(scalar reverse());
|
|
483 $aindex == -1 ? $aindex : (length($_) - $aindex - $l);
|
|
484 } @_;
|
|
485 } else {
|
|
486 my $aindex = $a->index(scalar reverse $_[0]);
|
|
487 return $aindex == -1 ? $aindex : (length($_[0]) - $aindex - $l);
|
|
488 }
|
|
489 }
|
|
490 if (defined $_) {
|
|
491 my $aindex = $a->index(scalar reverse());
|
|
492 return $aindex == -1 ? $aindex : (length($_) - $aindex - $l);
|
|
493 }
|
|
494
|
|
495 warn "arindex: \$_ is undefined: what are you indexing?\n";
|
|
496 return;
|
|
497 }
|
|
498
|
|
499 1;
|
|
500 __END__
|
|
501 =pod
|
|
502
|
|
503 =head1 NAME
|
|
504
|
|
505 String::Approx - Perl extension for approximate matching (fuzzy matching)
|
|
506
|
|
507 =head1 SYNOPSIS
|
|
508
|
|
509 use String::Approx 'amatch';
|
|
510
|
|
511 print if amatch("foobar");
|
|
512
|
|
513 my @matches = amatch("xyzzy", @inputs);
|
|
514
|
|
515 my @catches = amatch("plugh", ['2'], @inputs);
|
|
516
|
|
517 =head1 DESCRIPTION
|
|
518
|
|
519 String::Approx lets you match and substitute strings approximately.
|
|
520 With this you can emulate errors: typing errorrs, speling errors,
|
|
521 closely related vocabularies (colour color), genetic mutations (GAG
|
|
522 ACT), abbreviations (McScot, MacScot).
|
|
523
|
|
524 NOTE: String::Approx suits the task of B<string matching>, not
|
|
525 B<string comparison>, and it works for B<strings>, not for B<text>.
|
|
526
|
|
527 If you want to compare strings for similarity, you probably just want
|
|
528 the Levenshtein edit distance (explained below), the Text::Levenshtein
|
|
529 and Text::LevenshteinXS modules in CPAN. See also Text::WagnerFischer
|
|
530 and Text::PhraseDistance. (There are functions for this in String::Approx,
|
|
531 e.g. adist(), but their results sometimes differ from the bare Levenshtein
|
|
532 et al.)
|
|
533
|
|
534 If you want to compare things like text or source code, consisting of
|
|
535 B<words> or B<tokens> and B<phrases> and B<sentences>, or
|
|
536 B<expressions> and B<statements>, you should probably use some other
|
|
537 tool than String::Approx, like for example the standard UNIX diff(1)
|
|
538 tool, or the Algorithm::Diff module from CPAN.
|
|
539
|
|
540 The measure of B<approximateness> is the I<Levenshtein edit distance>.
|
|
541 It is the total number of "edits": insertions,
|
|
542
|
|
543 word world
|
|
544
|
|
545 deletions,
|
|
546
|
|
547 monkey money
|
|
548
|
|
549 and substitutions
|
|
550
|
|
551 sun fun
|
|
552
|
|
553 required to transform a string to another string. For example, to
|
|
554 transform I<"lead"> into I<"gold">, you need three edits:
|
|
555
|
|
556 lead gead goad gold
|
|
557
|
|
558 The edit distance of "lead" and "gold" is therefore three, or 75%.
|
|
559
|
|
560 B<String::Approx> uses the Levenshtein edit distance as its measure, but
|
|
561 String::Approx is not well-suited for comparing strings of different
|
|
562 length, in other words, if you want a "fuzzy eq", see above.
|
|
563 String::Approx is more like regular expressions or index(), it finds
|
|
564 substrings that are close matches.>
|
|
565
|
|
566 =head1 MATCH
|
|
567
|
|
568 use String::Approx 'amatch';
|
|
569
|
|
570 $matched = amatch("pattern")
|
|
571 $matched = amatch("pattern", [ modifiers ])
|
|
572
|
|
573 $any_matched = amatch("pattern", @inputs)
|
|
574 $any_matched = amatch("pattern", [ modifiers ], @inputs)
|
|
575
|
|
576 @match = amatch("pattern")
|
|
577 @match = amatch("pattern", [ modifiers ])
|
|
578
|
|
579 @matches = amatch("pattern", @inputs)
|
|
580 @matches = amatch("pattern", [ modifiers ], @inputs)
|
|
581
|
|
582 Match B<pattern> approximately. In list context return the matched
|
|
583 B<@inputs>. If no inputs are given, match against the B<$_>. In scalar
|
|
584 context return true if I<any> of the inputs match, false if none match.
|
|
585
|
|
586 Notice that the pattern is a string. Not a regular expression. None
|
|
587 of the regular expression notations (^, ., *, and so on) work. They
|
|
588 are characters just like the others. Note-on-note: some limited form
|
|
589 of I<"regular expressionism"> is planned in future: for example
|
|
590 character classes ([abc]) and I<any-chars> (.). But that feature will
|
|
591 be turned on by a special I<modifier> (just a guess: "r"), so there
|
|
592 should be no backward compatibility problem.
|
|
593
|
|
594 Notice also that matching is not symmetric. The inputs are matched
|
|
595 against the pattern, not the other way round. In other words: the
|
|
596 pattern can be a substring, a submatch, of an input element. An input
|
|
597 element is always a superstring of the pattern.
|
|
598
|
|
599 =head2 MODIFIERS
|
|
600
|
|
601 With the modifiers you can control the amount of approximateness and
|
|
602 certain other control variables. The modifiers are one or more
|
|
603 strings, for example B<"i">, within a string optionally separated by
|
|
604 whitespace. The modifiers are inside an anonymous array: the B<[ ]>
|
|
605 in the syntax are not notational, they really do mean B<[ ]>, for
|
|
606 example B<[ "i", "2" ]>. B<["2 i"]> would be identical.
|
|
607
|
|
608 The implicit default approximateness is 10%, rounded up. In other
|
|
609 words: every tenth character in the pattern may be an error, an edit.
|
|
610 You can explicitly set the maximum approximateness by supplying a
|
|
611 modifier like
|
|
612
|
|
613 number
|
|
614 number%
|
|
615
|
|
616 Examples: B<"3">, B<"15%">.
|
|
617
|
|
618 Note that C<0%> is not rounded up, it is equal to C<0>.
|
|
619
|
|
620 Using a similar syntax you can separately control the maximum number
|
|
621 of insertions, deletions, and substitutions by prefixing the numbers
|
|
622 with I, D, or S, like this:
|
|
623
|
|
624 Inumber
|
|
625 Inumber%
|
|
626 Dnumber
|
|
627 Dnumber%
|
|
628 Snumber
|
|
629 Snumber%
|
|
630
|
|
631 Examples: B<"I2">, B<"D20%">, B<"S0">.
|
|
632
|
|
633 You can ignore case (B<"A"> becames equal to B<"a"> and vice versa)
|
|
634 by adding the B<"i"> modifier.
|
|
635
|
|
636 For example
|
|
637
|
|
638 [ "i 25%", "S0" ]
|
|
639
|
|
640 means I<ignore case>, I<allow every fourth character to be "an edit">,
|
|
641 but allow I<no substitutions>. (See L<NOTES> about disallowing
|
|
642 substitutions or insertions.)
|
|
643
|
|
644 NOTE: setting C<I0 D0 S0> is not equivalent to using index().
|
|
645 If you want to use index(), use index().
|
|
646
|
|
647 =head1 SUBSTITUTE
|
|
648
|
|
649 use String::Approx 'asubstitute';
|
|
650
|
|
651 @substituted = asubstitute("pattern", "replacement")
|
|
652 @substituted = asubstitute("pattern", "replacement", @inputs)
|
|
653 @substituted = asubstitute("pattern", "replacement", [ modifiers ])
|
|
654 @substituted = asubstitute("pattern", "replacement",
|
|
655 [ modifiers ], @inputs)
|
|
656
|
|
657 Substitute approximate B<pattern> with B<replacement> and return as a
|
|
658 list <copies> of B<@inputs>, the substitutions having been made on the
|
|
659 elements that did match the pattern. If no inputs are given,
|
|
660 substitute in the B<$_>. The replacement can contain magic strings
|
|
661 B<$&>, B<$`>, B<$'> that stand for the matched string, the string
|
|
662 before it, and the string after it, respectively. All the other
|
|
663 arguments are as in C<amatch()>, plus one additional modifier, B<"g">
|
|
664 which means substitute globally (all the matches in an element and not
|
|
665 just the first one, as is the default).
|
|
666
|
|
667 See L<BAD NEWS> about the unfortunate stinginess of C<asubstitute()>.
|
|
668
|
|
669 =head1 INDEX
|
|
670
|
|
671 use String::Approx 'aindex';
|
|
672
|
|
673 $index = aindex("pattern")
|
|
674 @indices = aindex("pattern", @inputs)
|
|
675 $index = aindex("pattern", [ modifiers ])
|
|
676 @indices = aindex("pattern", [ modifiers ], @inputs)
|
|
677
|
|
678 Like C<amatch()> but returns the index/indices at which the pattern
|
|
679 matches approximately. In list context and if C<@inputs> are used,
|
|
680 returns a list of indices, one index for each input element.
|
|
681 If there's no approximate match, C<-1> is returned as the index.
|
|
682
|
|
683 NOTE: if there is character repetition (e.g. "aa") either in
|
|
684 the pattern or in the text, the returned index might start
|
|
685 "too early". This is consistent with the goal of the module
|
|
686 of matching "as early as possible", just like regular expressions
|
|
687 (that there might be a "less approximate" match starting later is
|
|
688 of somewhat irrelevant).
|
|
689
|
|
690 There's also backwards-scanning C<arindex()>.
|
|
691
|
|
692 =head1 SLICE
|
|
693
|
|
694 use String::Approx 'aslice';
|
|
695
|
|
696 ($index, $size) = aslice("pattern")
|
|
697 ([$i0, $s0], ...) = aslice("pattern", @inputs)
|
|
698 ($index, $size) = aslice("pattern", [ modifiers ])
|
|
699 ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs)
|
|
700
|
|
701 Like C<aindex()> but returns also the size (length) of the match.
|
|
702 If the match fails, returns an empty list (when matching against C<$_>)
|
|
703 or an empty anonymous list corresponding to the particular input.
|
|
704
|
|
705 NOTE: size of the match will very probably be something you did not
|
|
706 expect (such as longer than the pattern, or a negative number). This
|
|
707 may or may not be fixed in future releases. Also the beginning of the
|
|
708 match may vary from the expected as with aindex(), see above.
|
|
709
|
|
710 If the modifier
|
|
711
|
|
712 "minimal_distance"
|
|
713
|
|
714 is used, the minimal possible edit distance is returned as the
|
|
715 third element:
|
|
716
|
|
717 ($index, $size, $distance) = aslice("pattern", [ modifiers ])
|
|
718 ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs)
|
|
719
|
|
720 =head1 DISTANCE
|
|
721
|
|
722 use String::Approx 'adist';
|
|
723
|
|
724 $dist = adist("pattern", $input);
|
|
725 @dist = adist("pattern", @input);
|
|
726
|
|
727 Return the I<edit distance> or distances between the pattern and the
|
|
728 input or inputs. Zero edit distance means exact match. (Remember
|
|
729 that the match can 'float' in the inputs, the match is a substring
|
|
730 match.) If the pattern is longer than the input or inputs, the
|
|
731 returned distance or distances is or are negative.
|
|
732
|
|
733 use String::Approx 'adistr';
|
|
734
|
|
735 $dist = adistr("pattern", $input);
|
|
736 @dist = adistr("pattern", @inputs);
|
|
737
|
|
738 Return the B<relative> I<edit distance> or distances between the
|
|
739 pattern and the input or inputs. Zero relative edit distance means
|
|
740 exact match, one means completely different. (Remember that the
|
|
741 match can 'float' in the inputs, the match is a substring match.) If
|
|
742 the pattern is longer than the input or inputs, the returned distance
|
|
743 or distances is or are negative.
|
|
744
|
|
745 You can use adist() or adistr() to sort the inputs according to their
|
|
746 approximateness:
|
|
747
|
|
748 my %d;
|
|
749 @d{@inputs} = map { abs } adistr("pattern", @inputs);
|
|
750 my @d = sort { $d{$a} <=> $d{$b} } @inputs;
|
|
751
|
|
752 Now C<@d> contains the inputs, the most like C<"pattern"> first.
|
|
753
|
|
754 =head1 CONTROLLING THE CACHE
|
|
755
|
|
756 C<String::Approx> maintains a LU (least-used) cache that holds the
|
|
757 'matching engines' for each instance of a I<pattern+modifiers>. The
|
|
758 cache is intended to help the case where you match a small set of
|
|
759 patterns against a large set of string. However, the more engines you
|
|
760 cache the more you eat memory. If you have a lot of different
|
|
761 patterns or if you have a lot of memory to burn, you may want to
|
|
762 control the cache yourself. For example, allowing a larger cache
|
|
763 consumes more memory but probably runs a little bit faster since the
|
|
764 cache fills (and needs flushing) less often.
|
|
765
|
|
766 The cache has two parameters: I<max> and I<purge>. The first one
|
|
767 is the maximum size of the cache and the second one is the cache
|
|
768 flushing ratio: when the number of cache entries exceeds I<max>,
|
|
769 I<max> times I<purge> cache entries are flushed. The default
|
|
770 values are 1000 and 0.75, respectively, which means that when
|
|
771 the 1001st entry would be cached, 750 least used entries will
|
|
772 be removed from the cache. To access the parameters you can
|
|
773 use the calls
|
|
774
|
|
775 $now_max = String::Approx::cache_max();
|
|
776 String::Approx::cache_max($new_max);
|
|
777
|
|
778 $now_purge = String::Approx::cache_purge();
|
|
779 String::Approx::cache_purge($new_purge);
|
|
780
|
|
781 $limit = String::Approx::cache_n_purge();
|
|
782
|
|
783 To be honest, there are actually B<two> caches: the first one is used
|
|
784 far the patterns with no modifiers, the second one for the patterns
|
|
785 with pattern modifiers. Using the standard parameters you will
|
|
786 therefore actually cache up to 2000 entries. The above calls control
|
|
787 both caches for the same price.
|
|
788
|
|
789 To disable caching completely use
|
|
790
|
|
791 String::Approx::cache_disable();
|
|
792
|
|
793 Note that this doesn't flush any possibly existing cache entries,
|
|
794 to do that use
|
|
795
|
|
796 String::Approx::cache_flush_all();
|
|
797
|
|
798 =head1 NOTES
|
|
799
|
|
800 Because matching is by I<substrings>, not by whole strings, insertions
|
|
801 and substitutions produce often very similar results: "abcde" matches
|
|
802 "axbcde" either by insertion B<or> substitution of "x".
|
|
803
|
|
804 The maximum edit distance is also the maximum number of edits.
|
|
805 That is, the B<"I2"> in
|
|
806
|
|
807 amatch("abcd", ["I2"])
|
|
808
|
|
809 is useless because the maximum edit distance is (implicitly) 1.
|
|
810 You may have meant to say
|
|
811
|
|
812 amatch("abcd", ["2D1S1"])
|
|
813
|
|
814 or something like that.
|
|
815
|
|
816 If you want to simulate transposes
|
|
817
|
|
818 feet fete
|
|
819
|
|
820 you need to allow at least edit distance of two because in terms of
|
|
821 our edit primitives a transpose is first one deletion and then one
|
|
822 insertion.
|
|
823
|
|
824 =head2 TEXT POSITION
|
|
825
|
|
826 The starting and ending positions of matching, substituting, indexing, or
|
|
827 slicing can be changed from the beginning and end of the input(s) to
|
|
828 some other positions by using either or both of the modifiers
|
|
829
|
|
830 "initial_position=24"
|
|
831 "final_position=42"
|
|
832
|
|
833 or the both the modifiers
|
|
834
|
|
835 "initial_position=24"
|
|
836 "position_range=10"
|
|
837
|
|
838 By setting the B<"position_range"> to be zero you can limit
|
|
839 (anchor) the operation to happen only once (if a match is possible)
|
|
840 at the position.
|
|
841
|
|
842 =head1 VERSION
|
|
843
|
|
844 Major release 3.
|
|
845
|
|
846 =head1 CHANGES FROM VERSION 2
|
|
847
|
|
848 =head2 GOOD NEWS
|
|
849
|
|
850 =over 4
|
|
851
|
|
852 =item The version 3 is 2-3 times faster than version 2
|
|
853
|
|
854 =item No pattern length limitation
|
|
855
|
|
856 The algorithm is independent on the pattern length: its time
|
|
857 complexity is I<O(kn)>, where I<k> is the number of edits and I<n> the
|
|
858 length of the text (input). The preprocessing of the pattern will of
|
|
859 course take some I<O(m)> (I<m> being the pattern length) time, but
|
|
860 C<amatch()> and C<asubstitute()> cache the result of this
|
|
861 preprocessing so that it is done only once per pattern.
|
|
862
|
|
863 =back
|
|
864
|
|
865 =head2 BAD NEWS
|
|
866
|
|
867 =over 4
|
|
868
|
|
869 =item You do need a C compiler to install the module
|
|
870
|
|
871 Perl's regular expressions are no more used; instead a faster and more
|
|
872 scalable algorithm written in C is used.
|
|
873
|
|
874 =item C<asubstitute()> is now always stingy
|
|
875
|
|
876 The string matched and substituted is now always stingy, as short
|
|
877 as possible. It used to be as long as possible. This is an unfortunate
|
|
878 change stemming from switching the matching algorithm. Example: with
|
|
879 edit distance of two and substituting for B<"word"> from B<"cork"> and
|
|
880 B<"wool"> previously did match B<"cork"> and B<"wool">. Now it does
|
|
881 match B<"or"> and B<"wo">. As little as possible, or, in other words,
|
|
882 with as much approximateness, as many edits, as possible. Because
|
|
883 there is no I<need> to match the B<"c"> of B<"cork">, it is not matched.
|
|
884
|
|
885 =item no more C<aregex()> because regular expressions are no more used
|
|
886
|
|
887 =item no more C<compat1> for String::Approx version 1 compatibility
|
|
888
|
|
889 =back
|
|
890
|
|
891 =head1 ACKNOWLEDGEMENTS
|
|
892
|
|
893 The following people have provided valuable test cases, documentation
|
|
894 clarifications, and other feedback:
|
|
895
|
|
896 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz,
|
|
897 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda,
|
|
898 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner,
|
|
899 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton,
|
|
900 'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley,
|
|
901 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman,
|
|
902 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park,
|
|
903 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic,
|
|
904 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin,
|
|
905 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith,
|
|
906 Frank Tobin, Greg Ward, Rich Williams, Rick Wise.
|
|
907
|
|
908 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra
|
|
909 Gopal in the Department of Computer Science, University of Arizona.
|
|
910
|
|
911 =head1 AUTHOR
|
|
912
|
|
913 Jarkko Hietaniemi <jhi@iki.fi>
|
|
914
|
|
915 =head1 COPYRIGHT AND LICENSE
|
|
916
|
|
917 Copyright 2001-2013 by Jarkko Hietaniemi
|
|
918
|
|
919 This library is free software; you can redistribute it and/or modify
|
|
920 under either the terms of the Artistic License 2.0, or the GNU Library
|
|
921 General Public License, Version 2. See the files Artistic and LGPL
|
|
922 for more details.
|
|
923
|
|
924 Furthermore: no warranties or obligations of any kind are given, and
|
|
925 the separate file F<COPYRIGHT> must be included intact in all copies
|
|
926 and derived materials.
|
|
927
|
|
928 =cut
|