comparison 2.4/man/man3/String::Approx.3pm @ 13:e3609c8714fb draft

Uploaded
author plus91-technologies-pvt-ltd
date Fri, 30 May 2014 03:37:55 -0400
parents
children
comparison
equal deleted inserted replaced
12:980fce54e60a 13:e3609c8714fb
1 .\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.16)
2 .\"
3 .\" Standard preamble:
4 .\" ========================================================================
5 .de Sp \" Vertical space (when we can't use .PP)
6 .if t .sp .5v
7 .if n .sp
8 ..
9 .de Vb \" Begin verbatim text
10 .ft CW
11 .nf
12 .ne \\$1
13 ..
14 .de Ve \" End verbatim text
15 .ft R
16 .fi
17 ..
18 .\" Set up some character translations and predefined strings. \*(-- will
19 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
20 .\" double quote, and \*(R" will give a right double quote. \*(C+ will
21 .\" give a nicer C++. Capital omega is used to do unbreakable dashes and
22 .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
23 .\" nothing in troff, for use with C<>.
24 .tr \(*W-
25 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
26 .ie n \{\
27 . ds -- \(*W-
28 . ds PI pi
29 . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
30 . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
31 . ds L" ""
32 . ds R" ""
33 . ds C` ""
34 . ds C' ""
35 'br\}
36 .el\{\
37 . ds -- \|\(em\|
38 . ds PI \(*p
39 . ds L" ``
40 . ds R" ''
41 'br\}
42 .\"
43 .\" Escape single quotes in literal strings from groff's Unicode transform.
44 .ie \n(.g .ds Aq \(aq
45 .el .ds Aq '
46 .\"
47 .\" If the F register is turned on, we'll generate index entries on stderr for
48 .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
49 .\" entries marked with X<> in POD. Of course, you'll have to process the
50 .\" output yourself in some meaningful fashion.
51 .ie \nF \{\
52 . de IX
53 . tm Index:\\$1\t\\n%\t"\\$2"
54 ..
55 . nr % 0
56 . rr F
57 .\}
58 .el \{\
59 . de IX
60 ..
61 .\}
62 .\"
63 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
64 .\" Fear. Run. Save yourself. No user-serviceable parts.
65 . \" fudge factors for nroff and troff
66 .if n \{\
67 . ds #H 0
68 . ds #V .8m
69 . ds #F .3m
70 . ds #[ \f1
71 . ds #] \fP
72 .\}
73 .if t \{\
74 . ds #H ((1u-(\\\\n(.fu%2u))*.13m)
75 . ds #V .6m
76 . ds #F 0
77 . ds #[ \&
78 . ds #] \&
79 .\}
80 . \" simple accents for nroff and troff
81 .if n \{\
82 . ds ' \&
83 . ds ` \&
84 . ds ^ \&
85 . ds , \&
86 . ds ~ ~
87 . ds /
88 .\}
89 .if t \{\
90 . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
91 . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
92 . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
93 . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
94 . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
95 . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
96 .\}
97 . \" troff and (daisy-wheel) nroff accents
98 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
99 .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105 .ds ae a\h'-(\w'a'u*4/10)'e
106 .ds Ae A\h'-(\w'A'u*4/10)'E
107 . \" corrections for vroff
108 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110 . \" for low resolution devices (crt and lpr)
111 .if \n(.H>23 .if \n(.V>19 \
112 \{\
113 . ds : e
114 . ds 8 ss
115 . ds o a
116 . ds d- d\h'-1'\(ga
117 . ds D- D\h'-1'\(hy
118 . ds th \o'bp'
119 . ds Th \o'LP'
120 . ds ae ae
121 . ds Ae AE
122 .\}
123 .rm #[ #] #H #V #F C
124 .\" ========================================================================
125 .\"
126 .IX Title "Approx 3pm"
127 .TH Approx 3pm "2013-01-22" "perl v5.14.2" "User Contributed Perl Documentation"
128 .\" For nroff, turn off justification. Always turn off hyphenation; it makes
129 .\" way too many mistakes in technical documents.
130 .if n .ad l
131 .nh
132 .SH "NAME"
133 String::Approx \- Perl extension for approximate matching (fuzzy matching)
134 .SH "SYNOPSIS"
135 .IX Header "SYNOPSIS"
136 .Vb 1
137 \& use String::Approx \*(Aqamatch\*(Aq;
138 \&
139 \& print if amatch("foobar");
140 \&
141 \& my @matches = amatch("xyzzy", @inputs);
142 \&
143 \& my @catches = amatch("plugh", [\*(Aq2\*(Aq], @inputs);
144 .Ve
145 .SH "DESCRIPTION"
146 .IX Header "DESCRIPTION"
147 String::Approx lets you match and substitute strings approximately.
148 With this you can emulate errors: typing errorrs, speling errors,
149 closely related vocabularies (colour color), genetic mutations (\s-1GAG\s0
150 \&\s-1ACT\s0), abbreviations (McScot, MacScot).
151 .PP
152 \&\s-1NOTE:\s0 String::Approx suits the task of \fBstring matching\fR, not
153 \&\fBstring comparison\fR, and it works for \fBstrings\fR, not for \fBtext\fR.
154 .PP
155 If you want to compare strings for similarity, you probably just want
156 the Levenshtein edit distance (explained below), the Text::Levenshtein
157 and Text::LevenshteinXS modules in \s-1CPAN\s0. See also Text::WagnerFischer
158 and Text::PhraseDistance. (There are functions for this in String::Approx,
159 e.g. \fIadist()\fR, but their results sometimes differ from the bare Levenshtein
160 et al.)
161 .PP
162 If you want to compare things like text or source code, consisting of
163 \&\fBwords\fR or \fBtokens\fR and \fBphrases\fR and \fBsentences\fR, or
164 \&\fBexpressions\fR and \fBstatements\fR, you should probably use some other
165 tool than String::Approx, like for example the standard \s-1UNIX\s0 \fIdiff\fR\|(1)
166 tool, or the Algorithm::Diff module from \s-1CPAN\s0.
167 .PP
168 The measure of \fBapproximateness\fR is the \fILevenshtein edit distance\fR.
169 It is the total number of \*(L"edits\*(R": insertions,
170 .PP
171 .Vb 1
172 \& word world
173 .Ve
174 .PP
175 deletions,
176 .PP
177 .Vb 1
178 \& monkey money
179 .Ve
180 .PP
181 and substitutions
182 .PP
183 .Vb 1
184 \& sun fun
185 .Ve
186 .PP
187 required to transform a string to another string. For example, to
188 transform \fI\*(L"lead\*(R"\fR into \fI\*(L"gold\*(R"\fR, you need three edits:
189 .PP
190 .Vb 1
191 \& lead gead goad gold
192 .Ve
193 .PP
194 The edit distance of \*(L"lead\*(R" and \*(L"gold\*(R" is therefore three, or 75%.
195 .PP
196 \&\fBString::Approx\fR uses the Levenshtein edit distance as its measure, but
197 String::Approx is not well-suited for comparing strings of different
198 length, in other words, if you want a \*(L"fuzzy eq\*(R", see above.
199 String::Approx is more like regular expressions or \fIindex()\fR, it finds
200 substrings that are close matches.>
201 .SH "MATCH"
202 .IX Header "MATCH"
203 .Vb 1
204 \& use String::Approx \*(Aqamatch\*(Aq;
205 \&
206 \& $matched = amatch("pattern")
207 \& $matched = amatch("pattern", [ modifiers ])
208 \&
209 \& $any_matched = amatch("pattern", @inputs)
210 \& $any_matched = amatch("pattern", [ modifiers ], @inputs)
211 \&
212 \& @match = amatch("pattern")
213 \& @match = amatch("pattern", [ modifiers ])
214 \&
215 \& @matches = amatch("pattern", @inputs)
216 \& @matches = amatch("pattern", [ modifiers ], @inputs)
217 .Ve
218 .PP
219 Match \fBpattern\fR approximately. In list context return the matched
220 \&\fB\f(CB@inputs\fB\fR. If no inputs are given, match against the \fB\f(CB$_\fB\fR. In scalar
221 context return true if \fIany\fR of the inputs match, false if none match.
222 .PP
223 Notice that the pattern is a string. Not a regular expression. None
224 of the regular expression notations (^, ., *, and so on) work. They
225 are characters just like the others. Note-on-note: some limited form
226 of \fI\*(L"regular expressionism\*(R"\fR is planned in future: for example
227 character classes ([abc]) and \fIany-chars\fR (.). But that feature will
228 be turned on by a special \fImodifier\fR (just a guess: \*(L"r\*(R"), so there
229 should be no backward compatibility problem.
230 .PP
231 Notice also that matching is not symmetric. The inputs are matched
232 against the pattern, not the other way round. In other words: the
233 pattern can be a substring, a submatch, of an input element. An input
234 element is always a superstring of the pattern.
235 .SS "\s-1MODIFIERS\s0"
236 .IX Subsection "MODIFIERS"
237 With the modifiers you can control the amount of approximateness and
238 certain other control variables. The modifiers are one or more
239 strings, for example \fB\*(L"i\*(R"\fR, within a string optionally separated by
240 whitespace. The modifiers are inside an anonymous array: the \fB[ ]\fR
241 in the syntax are not notational, they really do mean \fB[ ]\fR, for
242 example \fB[ \*(L"i\*(R", \*(L"2\*(R" ]\fR. \fB[\*(L"2 i\*(R"]\fR would be identical.
243 .PP
244 The implicit default approximateness is 10%, rounded up. In other
245 words: every tenth character in the pattern may be an error, an edit.
246 You can explicitly set the maximum approximateness by supplying a
247 modifier like
248 .PP
249 .Vb 2
250 \& number
251 \& number%
252 .Ve
253 .PP
254 Examples: \fB\*(L"3\*(R"\fR, \fB\*(L"15%\*(R"\fR.
255 .PP
256 Note that \f(CW\*(C`0%\*(C'\fR is not rounded up, it is equal to \f(CW0\fR.
257 .PP
258 Using a similar syntax you can separately control the maximum number
259 of insertions, deletions, and substitutions by prefixing the numbers
260 with I, D, or S, like this:
261 .PP
262 .Vb 6
263 \& Inumber
264 \& Inumber%
265 \& Dnumber
266 \& Dnumber%
267 \& Snumber
268 \& Snumber%
269 .Ve
270 .PP
271 Examples: \fB\*(L"I2\*(R"\fR, \fB\*(L"D20%\*(R"\fR, \fB\*(L"S0\*(R"\fR.
272 .PP
273 You can ignore case (\fB\*(L"A\*(R"\fR becames equal to \fB\*(L"a\*(R"\fR and vice versa)
274 by adding the \fB\*(L"i\*(R"\fR modifier.
275 .PP
276 For example
277 .PP
278 .Vb 1
279 \& [ "i 25%", "S0" ]
280 .Ve
281 .PP
282 means \fIignore case\fR, \fIallow every fourth character to be \*(L"an edit\*(R"\fR,
283 but allow \fIno substitutions\fR. (See \s-1NOTES\s0 about disallowing
284 substitutions or insertions.)
285 .PP
286 \&\s-1NOTE:\s0 setting \f(CW\*(C`I0 D0 S0\*(C'\fR is not equivalent to using \fIindex()\fR.
287 If you want to use \fIindex()\fR, use \fIindex()\fR.
288 .SH "SUBSTITUTE"
289 .IX Header "SUBSTITUTE"
290 .Vb 1
291 \& use String::Approx \*(Aqasubstitute\*(Aq;
292 \&
293 \& @substituted = asubstitute("pattern", "replacement")
294 \& @substituted = asubstitute("pattern", "replacement", @inputs)
295 \& @substituted = asubstitute("pattern", "replacement", [ modifiers ])
296 \& @substituted = asubstitute("pattern", "replacement",
297 \& [ modifiers ], @inputs)
298 .Ve
299 .PP
300 Substitute approximate \fBpattern\fR with \fBreplacement\fR and return as a
301 list <copies> of \fB\f(CB@inputs\fB\fR, the substitutions having been made on the
302 elements that did match the pattern. If no inputs are given,
303 substitute in the \fB\f(CB$_\fB\fR. The replacement can contain magic strings
304 \&\fB$&\fR, \fB$`\fR, \fB$'\fR that stand for the matched string, the string
305 before it, and the string after it, respectively. All the other
306 arguments are as in \f(CW\*(C`amatch()\*(C'\fR, plus one additional modifier, \fB\*(L"g\*(R"\fR
307 which means substitute globally (all the matches in an element and not
308 just the first one, as is the default).
309 .PP
310 See \*(L"\s-1BAD\s0 \s-1NEWS\s0\*(R" about the unfortunate stinginess of \f(CW\*(C`asubstitute()\*(C'\fR.
311 .SH "INDEX"
312 .IX Header "INDEX"
313 .Vb 1
314 \& use String::Approx \*(Aqaindex\*(Aq;
315 \&
316 \& $index = aindex("pattern")
317 \& @indices = aindex("pattern", @inputs)
318 \& $index = aindex("pattern", [ modifiers ])
319 \& @indices = aindex("pattern", [ modifiers ], @inputs)
320 .Ve
321 .PP
322 Like \f(CW\*(C`amatch()\*(C'\fR but returns the index/indices at which the pattern
323 matches approximately. In list context and if \f(CW@inputs\fR are used,
324 returns a list of indices, one index for each input element.
325 If there's no approximate match, \f(CW\*(C`\-1\*(C'\fR is returned as the index.
326 .PP
327 \&\s-1NOTE:\s0 if there is character repetition (e.g. \*(L"aa\*(R") either in
328 the pattern or in the text, the returned index might start
329 \&\*(L"too early\*(R". This is consistent with the goal of the module
330 of matching \*(L"as early as possible\*(R", just like regular expressions
331 (that there might be a \*(L"less approximate\*(R" match starting later is
332 of somewhat irrelevant).
333 .PP
334 There's also backwards-scanning \f(CW\*(C`arindex()\*(C'\fR.
335 .SH "SLICE"
336 .IX Header "SLICE"
337 .Vb 1
338 \& use String::Approx \*(Aqaslice\*(Aq;
339 \&
340 \& ($index, $size) = aslice("pattern")
341 \& ([$i0, $s0], ...) = aslice("pattern", @inputs)
342 \& ($index, $size) = aslice("pattern", [ modifiers ])
343 \& ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs)
344 .Ve
345 .PP
346 Like \f(CW\*(C`aindex()\*(C'\fR but returns also the size (length) of the match.
347 If the match fails, returns an empty list (when matching against \f(CW$_\fR)
348 or an empty anonymous list corresponding to the particular input.
349 .PP
350 \&\s-1NOTE:\s0 size of the match will very probably be something you did not
351 expect (such as longer than the pattern, or a negative number). This
352 may or may not be fixed in future releases. Also the beginning of the
353 match may vary from the expected as with \fIaindex()\fR, see above.
354 .PP
355 If the modifier
356 .PP
357 .Vb 1
358 \& "minimal_distance"
359 .Ve
360 .PP
361 is used, the minimal possible edit distance is returned as the
362 third element:
363 .PP
364 .Vb 2
365 \& ($index, $size, $distance) = aslice("pattern", [ modifiers ])
366 \& ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs)
367 .Ve
368 .SH "DISTANCE"
369 .IX Header "DISTANCE"
370 .Vb 1
371 \& use String::Approx \*(Aqadist\*(Aq;
372 \&
373 \& $dist = adist("pattern", $input);
374 \& @dist = adist("pattern", @input);
375 .Ve
376 .PP
377 Return the \fIedit distance\fR or distances between the pattern and the
378 input or inputs. Zero edit distance means exact match. (Remember
379 that the match can 'float' in the inputs, the match is a substring
380 match.) If the pattern is longer than the input or inputs, the
381 returned distance or distances is or are negative.
382 .PP
383 .Vb 1
384 \& use String::Approx \*(Aqadistr\*(Aq;
385 \&
386 \& $dist = adistr("pattern", $input);
387 \& @dist = adistr("pattern", @inputs);
388 .Ve
389 .PP
390 Return the \fBrelative\fR \fIedit distance\fR or distances between the
391 pattern and the input or inputs. Zero relative edit distance means
392 exact match, one means completely different. (Remember that the
393 match can 'float' in the inputs, the match is a substring match.) If
394 the pattern is longer than the input or inputs, the returned distance
395 or distances is or are negative.
396 .PP
397 You can use \fIadist()\fR or \fIadistr()\fR to sort the inputs according to their
398 approximateness:
399 .PP
400 .Vb 3
401 \& my %d;
402 \& @d{@inputs} = map { abs } adistr("pattern", @inputs);
403 \& my @d = sort { $d{$a} <=> $d{$b} } @inputs;
404 .Ve
405 .PP
406 Now \f(CW@d\fR contains the inputs, the most like \f(CW"pattern"\fR first.
407 .SH "CONTROLLING THE CACHE"
408 .IX Header "CONTROLLING THE CACHE"
409 \&\f(CW\*(C`String::Approx\*(C'\fR maintains a \s-1LU\s0 (least-used) cache that holds the
410 \&'matching engines' for each instance of a \fIpattern+modifiers\fR. The
411 cache is intended to help the case where you match a small set of
412 patterns against a large set of string. However, the more engines you
413 cache the more you eat memory. If you have a lot of different
414 patterns or if you have a lot of memory to burn, you may want to
415 control the cache yourself. For example, allowing a larger cache
416 consumes more memory but probably runs a little bit faster since the
417 cache fills (and needs flushing) less often.
418 .PP
419 The cache has two parameters: \fImax\fR and \fIpurge\fR. The first one
420 is the maximum size of the cache and the second one is the cache
421 flushing ratio: when the number of cache entries exceeds \fImax\fR,
422 \&\fImax\fR times \fIpurge\fR cache entries are flushed. The default
423 values are 1000 and 0.75, respectively, which means that when
424 the 1001st entry would be cached, 750 least used entries will
425 be removed from the cache. To access the parameters you can
426 use the calls
427 .PP
428 .Vb 2
429 \& $now_max = String::Approx::cache_max();
430 \& String::Approx::cache_max($new_max);
431 \&
432 \& $now_purge = String::Approx::cache_purge();
433 \& String::Approx::cache_purge($new_purge);
434 \&
435 \& $limit = String::Approx::cache_n_purge();
436 .Ve
437 .PP
438 To be honest, there are actually \fBtwo\fR caches: the first one is used
439 far the patterns with no modifiers, the second one for the patterns
440 with pattern modifiers. Using the standard parameters you will
441 therefore actually cache up to 2000 entries. The above calls control
442 both caches for the same price.
443 .PP
444 To disable caching completely use
445 .PP
446 .Vb 1
447 \& String::Approx::cache_disable();
448 .Ve
449 .PP
450 Note that this doesn't flush any possibly existing cache entries,
451 to do that use
452 .PP
453 .Vb 1
454 \& String::Approx::cache_flush_all();
455 .Ve
456 .SH "NOTES"
457 .IX Header "NOTES"
458 Because matching is by \fIsubstrings\fR, not by whole strings, insertions
459 and substitutions produce often very similar results: \*(L"abcde\*(R" matches
460 \&\*(L"axbcde\*(R" either by insertion \fBor\fR substitution of \*(L"x\*(R".
461 .PP
462 The maximum edit distance is also the maximum number of edits.
463 That is, the \fB\*(L"I2\*(R"\fR in
464 .PP
465 .Vb 1
466 \& amatch("abcd", ["I2"])
467 .Ve
468 .PP
469 is useless because the maximum edit distance is (implicitly) 1.
470 You may have meant to say
471 .PP
472 .Vb 1
473 \& amatch("abcd", ["2D1S1"])
474 .Ve
475 .PP
476 or something like that.
477 .PP
478 If you want to simulate transposes
479 .PP
480 .Vb 1
481 \& feet fete
482 .Ve
483 .PP
484 you need to allow at least edit distance of two because in terms of
485 our edit primitives a transpose is first one deletion and then one
486 insertion.
487 .SS "\s-1TEXT\s0 \s-1POSITION\s0"
488 .IX Subsection "TEXT POSITION"
489 The starting and ending positions of matching, substituting, indexing, or
490 slicing can be changed from the beginning and end of the input(s) to
491 some other positions by using either or both of the modifiers
492 .PP
493 .Vb 2
494 \& "initial_position=24"
495 \& "final_position=42"
496 .Ve
497 .PP
498 or the both the modifiers
499 .PP
500 .Vb 2
501 \& "initial_position=24"
502 \& "position_range=10"
503 .Ve
504 .PP
505 By setting the \fB\*(L"position_range\*(R"\fR to be zero you can limit
506 (anchor) the operation to happen only once (if a match is possible)
507 at the position.
508 .SH "VERSION"
509 .IX Header "VERSION"
510 Major release 3.
511 .SH "CHANGES FROM VERSION 2"
512 .IX Header "CHANGES FROM VERSION 2"
513 .SS "\s-1GOOD\s0 \s-1NEWS\s0"
514 .IX Subsection "GOOD NEWS"
515 .IP "The version 3 is 2\-3 times faster than version 2" 4
516 .IX Item "The version 3 is 2-3 times faster than version 2"
517 .PD 0
518 .IP "No pattern length limitation" 4
519 .IX Item "No pattern length limitation"
520 .PD
521 The algorithm is independent on the pattern length: its time
522 complexity is \fIO(kn)\fR, where \fIk\fR is the number of edits and \fIn\fR the
523 length of the text (input). The preprocessing of the pattern will of
524 course take some \fIO(m)\fR (\fIm\fR being the pattern length) time, but
525 \&\f(CW\*(C`amatch()\*(C'\fR and \f(CW\*(C`asubstitute()\*(C'\fR cache the result of this
526 preprocessing so that it is done only once per pattern.
527 .SS "\s-1BAD\s0 \s-1NEWS\s0"
528 .IX Subsection "BAD NEWS"
529 .IP "You do need a C compiler to install the module" 4
530 .IX Item "You do need a C compiler to install the module"
531 Perl's regular expressions are no more used; instead a faster and more
532 scalable algorithm written in C is used.
533 .ie n .IP """asubstitute()"" is now always stingy" 4
534 .el .IP "\f(CWasubstitute()\fR is now always stingy" 4
535 .IX Item "asubstitute() is now always stingy"
536 The string matched and substituted is now always stingy, as short
537 as possible. It used to be as long as possible. This is an unfortunate
538 change stemming from switching the matching algorithm. Example: with
539 edit distance of two and substituting for \fB\*(L"word\*(R"\fR from \fB\*(L"cork\*(R"\fR and
540 \&\fB\*(L"wool\*(R"\fR previously did match \fB\*(L"cork\*(R"\fR and \fB\*(L"wool\*(R"\fR. Now it does
541 match \fB\*(L"or\*(R"\fR and \fB\*(L"wo\*(R"\fR. As little as possible, or, in other words,
542 with as much approximateness, as many edits, as possible. Because
543 there is no \fIneed\fR to match the \fB\*(L"c\*(R"\fR of \fB\*(L"cork\*(R"\fR, it is not matched.
544 .ie n .IP "no more ""aregex()"" because regular expressions are no more used" 4
545 .el .IP "no more \f(CWaregex()\fR because regular expressions are no more used" 4
546 .IX Item "no more aregex() because regular expressions are no more used"
547 .PD 0
548 .ie n .IP "no more ""compat1"" for String::Approx version 1 compatibility" 4
549 .el .IP "no more \f(CWcompat1\fR for String::Approx version 1 compatibility" 4
550 .IX Item "no more compat1 for String::Approx version 1 compatibility"
551 .PD
552 .SH "ACKNOWLEDGEMENTS"
553 .IX Header "ACKNOWLEDGEMENTS"
554 The following people have provided valuable test cases, documentation
555 clarifications, and other feedback:
556 .PP
557 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz,
558 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda,
559 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner,
560 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton,
561 \&'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley,
562 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman,
563 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park,
564 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic,
565 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin,
566 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith,
567 Frank Tobin, Greg Ward, Rich Williams, Rick Wise.
568 .PP
569 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra
570 Gopal in the Department of Computer Science, University of Arizona.
571 .SH "AUTHOR"
572 .IX Header "AUTHOR"
573 Jarkko Hietaniemi <jhi@iki.fi>
574 .SH "COPYRIGHT AND LICENSE"
575 .IX Header "COPYRIGHT AND LICENSE"
576 Copyright 2001\-2013 by Jarkko Hietaniemi
577 .PP
578 This library is free software; you can redistribute it and/or modify
579 under either the terms of the Artistic License 2.0, or the \s-1GNU\s0 Library
580 General Public License, Version 2. See the files Artistic and \s-1LGPL\s0
581 for more details.
582 .PP
583 Furthermore: no warranties or obligations of any kind are given, and
584 the separate file \fI\s-1COPYRIGHT\s0\fR must be included intact in all copies
585 and derived materials.