Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
comparison 2.4/man/man3/String::Approx.3pm @ 13:e3609c8714fb draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Fri, 30 May 2014 03:37:55 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:980fce54e60a | 13:e3609c8714fb |
---|---|
1 .\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.16) | |
2 .\" | |
3 .\" Standard preamble: | |
4 .\" ======================================================================== | |
5 .de Sp \" Vertical space (when we can't use .PP) | |
6 .if t .sp .5v | |
7 .if n .sp | |
8 .. | |
9 .de Vb \" Begin verbatim text | |
10 .ft CW | |
11 .nf | |
12 .ne \\$1 | |
13 .. | |
14 .de Ve \" End verbatim text | |
15 .ft R | |
16 .fi | |
17 .. | |
18 .\" Set up some character translations and predefined strings. \*(-- will | |
19 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
20 .\" double quote, and \*(R" will give a right double quote. \*(C+ will | |
21 .\" give a nicer C++. Capital omega is used to do unbreakable dashes and | |
22 .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, | |
23 .\" nothing in troff, for use with C<>. | |
24 .tr \(*W- | |
25 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
26 .ie n \{\ | |
27 . ds -- \(*W- | |
28 . ds PI pi | |
29 . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
30 . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
31 . ds L" "" | |
32 . ds R" "" | |
33 . ds C` "" | |
34 . ds C' "" | |
35 'br\} | |
36 .el\{\ | |
37 . ds -- \|\(em\| | |
38 . ds PI \(*p | |
39 . ds L" `` | |
40 . ds R" '' | |
41 'br\} | |
42 .\" | |
43 .\" Escape single quotes in literal strings from groff's Unicode transform. | |
44 .ie \n(.g .ds Aq \(aq | |
45 .el .ds Aq ' | |
46 .\" | |
47 .\" If the F register is turned on, we'll generate index entries on stderr for | |
48 .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index | |
49 .\" entries marked with X<> in POD. Of course, you'll have to process the | |
50 .\" output yourself in some meaningful fashion. | |
51 .ie \nF \{\ | |
52 . de IX | |
53 . tm Index:\\$1\t\\n%\t"\\$2" | |
54 .. | |
55 . nr % 0 | |
56 . rr F | |
57 .\} | |
58 .el \{\ | |
59 . de IX | |
60 .. | |
61 .\} | |
62 .\" | |
63 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
64 .\" Fear. Run. Save yourself. No user-serviceable parts. | |
65 . \" fudge factors for nroff and troff | |
66 .if n \{\ | |
67 . ds #H 0 | |
68 . ds #V .8m | |
69 . ds #F .3m | |
70 . ds #[ \f1 | |
71 . ds #] \fP | |
72 .\} | |
73 .if t \{\ | |
74 . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
75 . ds #V .6m | |
76 . ds #F 0 | |
77 . ds #[ \& | |
78 . ds #] \& | |
79 .\} | |
80 . \" simple accents for nroff and troff | |
81 .if n \{\ | |
82 . ds ' \& | |
83 . ds ` \& | |
84 . ds ^ \& | |
85 . ds , \& | |
86 . ds ~ ~ | |
87 . ds / | |
88 .\} | |
89 .if t \{\ | |
90 . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
91 . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
92 . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
93 . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
94 . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
95 . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
96 .\} | |
97 . \" troff and (daisy-wheel) nroff accents | |
98 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
99 .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
100 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
101 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
102 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
103 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
104 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
105 .ds ae a\h'-(\w'a'u*4/10)'e | |
106 .ds Ae A\h'-(\w'A'u*4/10)'E | |
107 . \" corrections for vroff | |
108 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
109 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
110 . \" for low resolution devices (crt and lpr) | |
111 .if \n(.H>23 .if \n(.V>19 \ | |
112 \{\ | |
113 . ds : e | |
114 . ds 8 ss | |
115 . ds o a | |
116 . ds d- d\h'-1'\(ga | |
117 . ds D- D\h'-1'\(hy | |
118 . ds th \o'bp' | |
119 . ds Th \o'LP' | |
120 . ds ae ae | |
121 . ds Ae AE | |
122 .\} | |
123 .rm #[ #] #H #V #F C | |
124 .\" ======================================================================== | |
125 .\" | |
126 .IX Title "Approx 3pm" | |
127 .TH Approx 3pm "2013-01-22" "perl v5.14.2" "User Contributed Perl Documentation" | |
128 .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
129 .\" way too many mistakes in technical documents. | |
130 .if n .ad l | |
131 .nh | |
132 .SH "NAME" | |
133 String::Approx \- Perl extension for approximate matching (fuzzy matching) | |
134 .SH "SYNOPSIS" | |
135 .IX Header "SYNOPSIS" | |
136 .Vb 1 | |
137 \& use String::Approx \*(Aqamatch\*(Aq; | |
138 \& | |
139 \& print if amatch("foobar"); | |
140 \& | |
141 \& my @matches = amatch("xyzzy", @inputs); | |
142 \& | |
143 \& my @catches = amatch("plugh", [\*(Aq2\*(Aq], @inputs); | |
144 .Ve | |
145 .SH "DESCRIPTION" | |
146 .IX Header "DESCRIPTION" | |
147 String::Approx lets you match and substitute strings approximately. | |
148 With this you can emulate errors: typing errorrs, speling errors, | |
149 closely related vocabularies (colour color), genetic mutations (\s-1GAG\s0 | |
150 \&\s-1ACT\s0), abbreviations (McScot, MacScot). | |
151 .PP | |
152 \&\s-1NOTE:\s0 String::Approx suits the task of \fBstring matching\fR, not | |
153 \&\fBstring comparison\fR, and it works for \fBstrings\fR, not for \fBtext\fR. | |
154 .PP | |
155 If you want to compare strings for similarity, you probably just want | |
156 the Levenshtein edit distance (explained below), the Text::Levenshtein | |
157 and Text::LevenshteinXS modules in \s-1CPAN\s0. See also Text::WagnerFischer | |
158 and Text::PhraseDistance. (There are functions for this in String::Approx, | |
159 e.g. \fIadist()\fR, but their results sometimes differ from the bare Levenshtein | |
160 et al.) | |
161 .PP | |
162 If you want to compare things like text or source code, consisting of | |
163 \&\fBwords\fR or \fBtokens\fR and \fBphrases\fR and \fBsentences\fR, or | |
164 \&\fBexpressions\fR and \fBstatements\fR, you should probably use some other | |
165 tool than String::Approx, like for example the standard \s-1UNIX\s0 \fIdiff\fR\|(1) | |
166 tool, or the Algorithm::Diff module from \s-1CPAN\s0. | |
167 .PP | |
168 The measure of \fBapproximateness\fR is the \fILevenshtein edit distance\fR. | |
169 It is the total number of \*(L"edits\*(R": insertions, | |
170 .PP | |
171 .Vb 1 | |
172 \& word world | |
173 .Ve | |
174 .PP | |
175 deletions, | |
176 .PP | |
177 .Vb 1 | |
178 \& monkey money | |
179 .Ve | |
180 .PP | |
181 and substitutions | |
182 .PP | |
183 .Vb 1 | |
184 \& sun fun | |
185 .Ve | |
186 .PP | |
187 required to transform a string to another string. For example, to | |
188 transform \fI\*(L"lead\*(R"\fR into \fI\*(L"gold\*(R"\fR, you need three edits: | |
189 .PP | |
190 .Vb 1 | |
191 \& lead gead goad gold | |
192 .Ve | |
193 .PP | |
194 The edit distance of \*(L"lead\*(R" and \*(L"gold\*(R" is therefore three, or 75%. | |
195 .PP | |
196 \&\fBString::Approx\fR uses the Levenshtein edit distance as its measure, but | |
197 String::Approx is not well-suited for comparing strings of different | |
198 length, in other words, if you want a \*(L"fuzzy eq\*(R", see above. | |
199 String::Approx is more like regular expressions or \fIindex()\fR, it finds | |
200 substrings that are close matches.> | |
201 .SH "MATCH" | |
202 .IX Header "MATCH" | |
203 .Vb 1 | |
204 \& use String::Approx \*(Aqamatch\*(Aq; | |
205 \& | |
206 \& $matched = amatch("pattern") | |
207 \& $matched = amatch("pattern", [ modifiers ]) | |
208 \& | |
209 \& $any_matched = amatch("pattern", @inputs) | |
210 \& $any_matched = amatch("pattern", [ modifiers ], @inputs) | |
211 \& | |
212 \& @match = amatch("pattern") | |
213 \& @match = amatch("pattern", [ modifiers ]) | |
214 \& | |
215 \& @matches = amatch("pattern", @inputs) | |
216 \& @matches = amatch("pattern", [ modifiers ], @inputs) | |
217 .Ve | |
218 .PP | |
219 Match \fBpattern\fR approximately. In list context return the matched | |
220 \&\fB\f(CB@inputs\fB\fR. If no inputs are given, match against the \fB\f(CB$_\fB\fR. In scalar | |
221 context return true if \fIany\fR of the inputs match, false if none match. | |
222 .PP | |
223 Notice that the pattern is a string. Not a regular expression. None | |
224 of the regular expression notations (^, ., *, and so on) work. They | |
225 are characters just like the others. Note-on-note: some limited form | |
226 of \fI\*(L"regular expressionism\*(R"\fR is planned in future: for example | |
227 character classes ([abc]) and \fIany-chars\fR (.). But that feature will | |
228 be turned on by a special \fImodifier\fR (just a guess: \*(L"r\*(R"), so there | |
229 should be no backward compatibility problem. | |
230 .PP | |
231 Notice also that matching is not symmetric. The inputs are matched | |
232 against the pattern, not the other way round. In other words: the | |
233 pattern can be a substring, a submatch, of an input element. An input | |
234 element is always a superstring of the pattern. | |
235 .SS "\s-1MODIFIERS\s0" | |
236 .IX Subsection "MODIFIERS" | |
237 With the modifiers you can control the amount of approximateness and | |
238 certain other control variables. The modifiers are one or more | |
239 strings, for example \fB\*(L"i\*(R"\fR, within a string optionally separated by | |
240 whitespace. The modifiers are inside an anonymous array: the \fB[ ]\fR | |
241 in the syntax are not notational, they really do mean \fB[ ]\fR, for | |
242 example \fB[ \*(L"i\*(R", \*(L"2\*(R" ]\fR. \fB[\*(L"2 i\*(R"]\fR would be identical. | |
243 .PP | |
244 The implicit default approximateness is 10%, rounded up. In other | |
245 words: every tenth character in the pattern may be an error, an edit. | |
246 You can explicitly set the maximum approximateness by supplying a | |
247 modifier like | |
248 .PP | |
249 .Vb 2 | |
250 \& number | |
251 \& number% | |
252 .Ve | |
253 .PP | |
254 Examples: \fB\*(L"3\*(R"\fR, \fB\*(L"15%\*(R"\fR. | |
255 .PP | |
256 Note that \f(CW\*(C`0%\*(C'\fR is not rounded up, it is equal to \f(CW0\fR. | |
257 .PP | |
258 Using a similar syntax you can separately control the maximum number | |
259 of insertions, deletions, and substitutions by prefixing the numbers | |
260 with I, D, or S, like this: | |
261 .PP | |
262 .Vb 6 | |
263 \& Inumber | |
264 \& Inumber% | |
265 \& Dnumber | |
266 \& Dnumber% | |
267 \& Snumber | |
268 \& Snumber% | |
269 .Ve | |
270 .PP | |
271 Examples: \fB\*(L"I2\*(R"\fR, \fB\*(L"D20%\*(R"\fR, \fB\*(L"S0\*(R"\fR. | |
272 .PP | |
273 You can ignore case (\fB\*(L"A\*(R"\fR becames equal to \fB\*(L"a\*(R"\fR and vice versa) | |
274 by adding the \fB\*(L"i\*(R"\fR modifier. | |
275 .PP | |
276 For example | |
277 .PP | |
278 .Vb 1 | |
279 \& [ "i 25%", "S0" ] | |
280 .Ve | |
281 .PP | |
282 means \fIignore case\fR, \fIallow every fourth character to be \*(L"an edit\*(R"\fR, | |
283 but allow \fIno substitutions\fR. (See \s-1NOTES\s0 about disallowing | |
284 substitutions or insertions.) | |
285 .PP | |
286 \&\s-1NOTE:\s0 setting \f(CW\*(C`I0 D0 S0\*(C'\fR is not equivalent to using \fIindex()\fR. | |
287 If you want to use \fIindex()\fR, use \fIindex()\fR. | |
288 .SH "SUBSTITUTE" | |
289 .IX Header "SUBSTITUTE" | |
290 .Vb 1 | |
291 \& use String::Approx \*(Aqasubstitute\*(Aq; | |
292 \& | |
293 \& @substituted = asubstitute("pattern", "replacement") | |
294 \& @substituted = asubstitute("pattern", "replacement", @inputs) | |
295 \& @substituted = asubstitute("pattern", "replacement", [ modifiers ]) | |
296 \& @substituted = asubstitute("pattern", "replacement", | |
297 \& [ modifiers ], @inputs) | |
298 .Ve | |
299 .PP | |
300 Substitute approximate \fBpattern\fR with \fBreplacement\fR and return as a | |
301 list <copies> of \fB\f(CB@inputs\fB\fR, the substitutions having been made on the | |
302 elements that did match the pattern. If no inputs are given, | |
303 substitute in the \fB\f(CB$_\fB\fR. The replacement can contain magic strings | |
304 \&\fB$&\fR, \fB$`\fR, \fB$'\fR that stand for the matched string, the string | |
305 before it, and the string after it, respectively. All the other | |
306 arguments are as in \f(CW\*(C`amatch()\*(C'\fR, plus one additional modifier, \fB\*(L"g\*(R"\fR | |
307 which means substitute globally (all the matches in an element and not | |
308 just the first one, as is the default). | |
309 .PP | |
310 See \*(L"\s-1BAD\s0 \s-1NEWS\s0\*(R" about the unfortunate stinginess of \f(CW\*(C`asubstitute()\*(C'\fR. | |
311 .SH "INDEX" | |
312 .IX Header "INDEX" | |
313 .Vb 1 | |
314 \& use String::Approx \*(Aqaindex\*(Aq; | |
315 \& | |
316 \& $index = aindex("pattern") | |
317 \& @indices = aindex("pattern", @inputs) | |
318 \& $index = aindex("pattern", [ modifiers ]) | |
319 \& @indices = aindex("pattern", [ modifiers ], @inputs) | |
320 .Ve | |
321 .PP | |
322 Like \f(CW\*(C`amatch()\*(C'\fR but returns the index/indices at which the pattern | |
323 matches approximately. In list context and if \f(CW@inputs\fR are used, | |
324 returns a list of indices, one index for each input element. | |
325 If there's no approximate match, \f(CW\*(C`\-1\*(C'\fR is returned as the index. | |
326 .PP | |
327 \&\s-1NOTE:\s0 if there is character repetition (e.g. \*(L"aa\*(R") either in | |
328 the pattern or in the text, the returned index might start | |
329 \&\*(L"too early\*(R". This is consistent with the goal of the module | |
330 of matching \*(L"as early as possible\*(R", just like regular expressions | |
331 (that there might be a \*(L"less approximate\*(R" match starting later is | |
332 of somewhat irrelevant). | |
333 .PP | |
334 There's also backwards-scanning \f(CW\*(C`arindex()\*(C'\fR. | |
335 .SH "SLICE" | |
336 .IX Header "SLICE" | |
337 .Vb 1 | |
338 \& use String::Approx \*(Aqaslice\*(Aq; | |
339 \& | |
340 \& ($index, $size) = aslice("pattern") | |
341 \& ([$i0, $s0], ...) = aslice("pattern", @inputs) | |
342 \& ($index, $size) = aslice("pattern", [ modifiers ]) | |
343 \& ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
344 .Ve | |
345 .PP | |
346 Like \f(CW\*(C`aindex()\*(C'\fR but returns also the size (length) of the match. | |
347 If the match fails, returns an empty list (when matching against \f(CW$_\fR) | |
348 or an empty anonymous list corresponding to the particular input. | |
349 .PP | |
350 \&\s-1NOTE:\s0 size of the match will very probably be something you did not | |
351 expect (such as longer than the pattern, or a negative number). This | |
352 may or may not be fixed in future releases. Also the beginning of the | |
353 match may vary from the expected as with \fIaindex()\fR, see above. | |
354 .PP | |
355 If the modifier | |
356 .PP | |
357 .Vb 1 | |
358 \& "minimal_distance" | |
359 .Ve | |
360 .PP | |
361 is used, the minimal possible edit distance is returned as the | |
362 third element: | |
363 .PP | |
364 .Vb 2 | |
365 \& ($index, $size, $distance) = aslice("pattern", [ modifiers ]) | |
366 \& ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
367 .Ve | |
368 .SH "DISTANCE" | |
369 .IX Header "DISTANCE" | |
370 .Vb 1 | |
371 \& use String::Approx \*(Aqadist\*(Aq; | |
372 \& | |
373 \& $dist = adist("pattern", $input); | |
374 \& @dist = adist("pattern", @input); | |
375 .Ve | |
376 .PP | |
377 Return the \fIedit distance\fR or distances between the pattern and the | |
378 input or inputs. Zero edit distance means exact match. (Remember | |
379 that the match can 'float' in the inputs, the match is a substring | |
380 match.) If the pattern is longer than the input or inputs, the | |
381 returned distance or distances is or are negative. | |
382 .PP | |
383 .Vb 1 | |
384 \& use String::Approx \*(Aqadistr\*(Aq; | |
385 \& | |
386 \& $dist = adistr("pattern", $input); | |
387 \& @dist = adistr("pattern", @inputs); | |
388 .Ve | |
389 .PP | |
390 Return the \fBrelative\fR \fIedit distance\fR or distances between the | |
391 pattern and the input or inputs. Zero relative edit distance means | |
392 exact match, one means completely different. (Remember that the | |
393 match can 'float' in the inputs, the match is a substring match.) If | |
394 the pattern is longer than the input or inputs, the returned distance | |
395 or distances is or are negative. | |
396 .PP | |
397 You can use \fIadist()\fR or \fIadistr()\fR to sort the inputs according to their | |
398 approximateness: | |
399 .PP | |
400 .Vb 3 | |
401 \& my %d; | |
402 \& @d{@inputs} = map { abs } adistr("pattern", @inputs); | |
403 \& my @d = sort { $d{$a} <=> $d{$b} } @inputs; | |
404 .Ve | |
405 .PP | |
406 Now \f(CW@d\fR contains the inputs, the most like \f(CW"pattern"\fR first. | |
407 .SH "CONTROLLING THE CACHE" | |
408 .IX Header "CONTROLLING THE CACHE" | |
409 \&\f(CW\*(C`String::Approx\*(C'\fR maintains a \s-1LU\s0 (least-used) cache that holds the | |
410 \&'matching engines' for each instance of a \fIpattern+modifiers\fR. The | |
411 cache is intended to help the case where you match a small set of | |
412 patterns against a large set of string. However, the more engines you | |
413 cache the more you eat memory. If you have a lot of different | |
414 patterns or if you have a lot of memory to burn, you may want to | |
415 control the cache yourself. For example, allowing a larger cache | |
416 consumes more memory but probably runs a little bit faster since the | |
417 cache fills (and needs flushing) less often. | |
418 .PP | |
419 The cache has two parameters: \fImax\fR and \fIpurge\fR. The first one | |
420 is the maximum size of the cache and the second one is the cache | |
421 flushing ratio: when the number of cache entries exceeds \fImax\fR, | |
422 \&\fImax\fR times \fIpurge\fR cache entries are flushed. The default | |
423 values are 1000 and 0.75, respectively, which means that when | |
424 the 1001st entry would be cached, 750 least used entries will | |
425 be removed from the cache. To access the parameters you can | |
426 use the calls | |
427 .PP | |
428 .Vb 2 | |
429 \& $now_max = String::Approx::cache_max(); | |
430 \& String::Approx::cache_max($new_max); | |
431 \& | |
432 \& $now_purge = String::Approx::cache_purge(); | |
433 \& String::Approx::cache_purge($new_purge); | |
434 \& | |
435 \& $limit = String::Approx::cache_n_purge(); | |
436 .Ve | |
437 .PP | |
438 To be honest, there are actually \fBtwo\fR caches: the first one is used | |
439 far the patterns with no modifiers, the second one for the patterns | |
440 with pattern modifiers. Using the standard parameters you will | |
441 therefore actually cache up to 2000 entries. The above calls control | |
442 both caches for the same price. | |
443 .PP | |
444 To disable caching completely use | |
445 .PP | |
446 .Vb 1 | |
447 \& String::Approx::cache_disable(); | |
448 .Ve | |
449 .PP | |
450 Note that this doesn't flush any possibly existing cache entries, | |
451 to do that use | |
452 .PP | |
453 .Vb 1 | |
454 \& String::Approx::cache_flush_all(); | |
455 .Ve | |
456 .SH "NOTES" | |
457 .IX Header "NOTES" | |
458 Because matching is by \fIsubstrings\fR, not by whole strings, insertions | |
459 and substitutions produce often very similar results: \*(L"abcde\*(R" matches | |
460 \&\*(L"axbcde\*(R" either by insertion \fBor\fR substitution of \*(L"x\*(R". | |
461 .PP | |
462 The maximum edit distance is also the maximum number of edits. | |
463 That is, the \fB\*(L"I2\*(R"\fR in | |
464 .PP | |
465 .Vb 1 | |
466 \& amatch("abcd", ["I2"]) | |
467 .Ve | |
468 .PP | |
469 is useless because the maximum edit distance is (implicitly) 1. | |
470 You may have meant to say | |
471 .PP | |
472 .Vb 1 | |
473 \& amatch("abcd", ["2D1S1"]) | |
474 .Ve | |
475 .PP | |
476 or something like that. | |
477 .PP | |
478 If you want to simulate transposes | |
479 .PP | |
480 .Vb 1 | |
481 \& feet fete | |
482 .Ve | |
483 .PP | |
484 you need to allow at least edit distance of two because in terms of | |
485 our edit primitives a transpose is first one deletion and then one | |
486 insertion. | |
487 .SS "\s-1TEXT\s0 \s-1POSITION\s0" | |
488 .IX Subsection "TEXT POSITION" | |
489 The starting and ending positions of matching, substituting, indexing, or | |
490 slicing can be changed from the beginning and end of the input(s) to | |
491 some other positions by using either or both of the modifiers | |
492 .PP | |
493 .Vb 2 | |
494 \& "initial_position=24" | |
495 \& "final_position=42" | |
496 .Ve | |
497 .PP | |
498 or the both the modifiers | |
499 .PP | |
500 .Vb 2 | |
501 \& "initial_position=24" | |
502 \& "position_range=10" | |
503 .Ve | |
504 .PP | |
505 By setting the \fB\*(L"position_range\*(R"\fR to be zero you can limit | |
506 (anchor) the operation to happen only once (if a match is possible) | |
507 at the position. | |
508 .SH "VERSION" | |
509 .IX Header "VERSION" | |
510 Major release 3. | |
511 .SH "CHANGES FROM VERSION 2" | |
512 .IX Header "CHANGES FROM VERSION 2" | |
513 .SS "\s-1GOOD\s0 \s-1NEWS\s0" | |
514 .IX Subsection "GOOD NEWS" | |
515 .IP "The version 3 is 2\-3 times faster than version 2" 4 | |
516 .IX Item "The version 3 is 2-3 times faster than version 2" | |
517 .PD 0 | |
518 .IP "No pattern length limitation" 4 | |
519 .IX Item "No pattern length limitation" | |
520 .PD | |
521 The algorithm is independent on the pattern length: its time | |
522 complexity is \fIO(kn)\fR, where \fIk\fR is the number of edits and \fIn\fR the | |
523 length of the text (input). The preprocessing of the pattern will of | |
524 course take some \fIO(m)\fR (\fIm\fR being the pattern length) time, but | |
525 \&\f(CW\*(C`amatch()\*(C'\fR and \f(CW\*(C`asubstitute()\*(C'\fR cache the result of this | |
526 preprocessing so that it is done only once per pattern. | |
527 .SS "\s-1BAD\s0 \s-1NEWS\s0" | |
528 .IX Subsection "BAD NEWS" | |
529 .IP "You do need a C compiler to install the module" 4 | |
530 .IX Item "You do need a C compiler to install the module" | |
531 Perl's regular expressions are no more used; instead a faster and more | |
532 scalable algorithm written in C is used. | |
533 .ie n .IP """asubstitute()"" is now always stingy" 4 | |
534 .el .IP "\f(CWasubstitute()\fR is now always stingy" 4 | |
535 .IX Item "asubstitute() is now always stingy" | |
536 The string matched and substituted is now always stingy, as short | |
537 as possible. It used to be as long as possible. This is an unfortunate | |
538 change stemming from switching the matching algorithm. Example: with | |
539 edit distance of two and substituting for \fB\*(L"word\*(R"\fR from \fB\*(L"cork\*(R"\fR and | |
540 \&\fB\*(L"wool\*(R"\fR previously did match \fB\*(L"cork\*(R"\fR and \fB\*(L"wool\*(R"\fR. Now it does | |
541 match \fB\*(L"or\*(R"\fR and \fB\*(L"wo\*(R"\fR. As little as possible, or, in other words, | |
542 with as much approximateness, as many edits, as possible. Because | |
543 there is no \fIneed\fR to match the \fB\*(L"c\*(R"\fR of \fB\*(L"cork\*(R"\fR, it is not matched. | |
544 .ie n .IP "no more ""aregex()"" because regular expressions are no more used" 4 | |
545 .el .IP "no more \f(CWaregex()\fR because regular expressions are no more used" 4 | |
546 .IX Item "no more aregex() because regular expressions are no more used" | |
547 .PD 0 | |
548 .ie n .IP "no more ""compat1"" for String::Approx version 1 compatibility" 4 | |
549 .el .IP "no more \f(CWcompat1\fR for String::Approx version 1 compatibility" 4 | |
550 .IX Item "no more compat1 for String::Approx version 1 compatibility" | |
551 .PD | |
552 .SH "ACKNOWLEDGEMENTS" | |
553 .IX Header "ACKNOWLEDGEMENTS" | |
554 The following people have provided valuable test cases, documentation | |
555 clarifications, and other feedback: | |
556 .PP | |
557 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz, | |
558 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda, | |
559 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner, | |
560 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton, | |
561 \&'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley, | |
562 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman, | |
563 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park, | |
564 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic, | |
565 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin, | |
566 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith, | |
567 Frank Tobin, Greg Ward, Rich Williams, Rick Wise. | |
568 .PP | |
569 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra | |
570 Gopal in the Department of Computer Science, University of Arizona. | |
571 .SH "AUTHOR" | |
572 .IX Header "AUTHOR" | |
573 Jarkko Hietaniemi <jhi@iki.fi> | |
574 .SH "COPYRIGHT AND LICENSE" | |
575 .IX Header "COPYRIGHT AND LICENSE" | |
576 Copyright 2001\-2013 by Jarkko Hietaniemi | |
577 .PP | |
578 This library is free software; you can redistribute it and/or modify | |
579 under either the terms of the Artistic License 2.0, or the \s-1GNU\s0 Library | |
580 General Public License, Version 2. See the files Artistic and \s-1LGPL\s0 | |
581 for more details. | |
582 .PP | |
583 Furthermore: no warranties or obligations of any kind are given, and | |
584 the separate file \fI\s-1COPYRIGHT\s0\fR must be included intact in all copies | |
585 and derived materials. |