13
|
1 .\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.16)
|
|
2 .\"
|
|
3 .\" Standard preamble:
|
|
4 .\" ========================================================================
|
|
5 .de Sp \" Vertical space (when we can't use .PP)
|
|
6 .if t .sp .5v
|
|
7 .if n .sp
|
|
8 ..
|
|
9 .de Vb \" Begin verbatim text
|
|
10 .ft CW
|
|
11 .nf
|
|
12 .ne \\$1
|
|
13 ..
|
|
14 .de Ve \" End verbatim text
|
|
15 .ft R
|
|
16 .fi
|
|
17 ..
|
|
18 .\" Set up some character translations and predefined strings. \*(-- will
|
|
19 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
|
|
20 .\" double quote, and \*(R" will give a right double quote. \*(C+ will
|
|
21 .\" give a nicer C++. Capital omega is used to do unbreakable dashes and
|
|
22 .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
|
|
23 .\" nothing in troff, for use with C<>.
|
|
24 .tr \(*W-
|
|
25 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
|
|
26 .ie n \{\
|
|
27 . ds -- \(*W-
|
|
28 . ds PI pi
|
|
29 . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
|
|
30 . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
|
|
31 . ds L" ""
|
|
32 . ds R" ""
|
|
33 . ds C` ""
|
|
34 . ds C' ""
|
|
35 'br\}
|
|
36 .el\{\
|
|
37 . ds -- \|\(em\|
|
|
38 . ds PI \(*p
|
|
39 . ds L" ``
|
|
40 . ds R" ''
|
|
41 'br\}
|
|
42 .\"
|
|
43 .\" Escape single quotes in literal strings from groff's Unicode transform.
|
|
44 .ie \n(.g .ds Aq \(aq
|
|
45 .el .ds Aq '
|
|
46 .\"
|
|
47 .\" If the F register is turned on, we'll generate index entries on stderr for
|
|
48 .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
|
|
49 .\" entries marked with X<> in POD. Of course, you'll have to process the
|
|
50 .\" output yourself in some meaningful fashion.
|
|
51 .ie \nF \{\
|
|
52 . de IX
|
|
53 . tm Index:\\$1\t\\n%\t"\\$2"
|
|
54 ..
|
|
55 . nr % 0
|
|
56 . rr F
|
|
57 .\}
|
|
58 .el \{\
|
|
59 . de IX
|
|
60 ..
|
|
61 .\}
|
|
62 .\"
|
|
63 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
|
|
64 .\" Fear. Run. Save yourself. No user-serviceable parts.
|
|
65 . \" fudge factors for nroff and troff
|
|
66 .if n \{\
|
|
67 . ds #H 0
|
|
68 . ds #V .8m
|
|
69 . ds #F .3m
|
|
70 . ds #[ \f1
|
|
71 . ds #] \fP
|
|
72 .\}
|
|
73 .if t \{\
|
|
74 . ds #H ((1u-(\\\\n(.fu%2u))*.13m)
|
|
75 . ds #V .6m
|
|
76 . ds #F 0
|
|
77 . ds #[ \&
|
|
78 . ds #] \&
|
|
79 .\}
|
|
80 . \" simple accents for nroff and troff
|
|
81 .if n \{\
|
|
82 . ds ' \&
|
|
83 . ds ` \&
|
|
84 . ds ^ \&
|
|
85 . ds , \&
|
|
86 . ds ~ ~
|
|
87 . ds /
|
|
88 .\}
|
|
89 .if t \{\
|
|
90 . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
|
|
91 . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
|
|
92 . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
|
|
93 . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
|
|
94 . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
|
|
95 . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
|
|
96 .\}
|
|
97 . \" troff and (daisy-wheel) nroff accents
|
|
98 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
|
|
99 .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
|
|
100 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
|
|
101 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
|
|
102 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
|
|
103 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
|
|
104 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
|
|
105 .ds ae a\h'-(\w'a'u*4/10)'e
|
|
106 .ds Ae A\h'-(\w'A'u*4/10)'E
|
|
107 . \" corrections for vroff
|
|
108 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
|
|
109 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
|
|
110 . \" for low resolution devices (crt and lpr)
|
|
111 .if \n(.H>23 .if \n(.V>19 \
|
|
112 \{\
|
|
113 . ds : e
|
|
114 . ds 8 ss
|
|
115 . ds o a
|
|
116 . ds d- d\h'-1'\(ga
|
|
117 . ds D- D\h'-1'\(hy
|
|
118 . ds th \o'bp'
|
|
119 . ds Th \o'LP'
|
|
120 . ds ae ae
|
|
121 . ds Ae AE
|
|
122 .\}
|
|
123 .rm #[ #] #H #V #F C
|
|
124 .\" ========================================================================
|
|
125 .\"
|
|
126 .IX Title "Approx 3pm"
|
|
127 .TH Approx 3pm "2013-01-22" "perl v5.14.2" "User Contributed Perl Documentation"
|
|
128 .\" For nroff, turn off justification. Always turn off hyphenation; it makes
|
|
129 .\" way too many mistakes in technical documents.
|
|
130 .if n .ad l
|
|
131 .nh
|
|
132 .SH "NAME"
|
|
133 String::Approx \- Perl extension for approximate matching (fuzzy matching)
|
|
134 .SH "SYNOPSIS"
|
|
135 .IX Header "SYNOPSIS"
|
|
136 .Vb 1
|
|
137 \& use String::Approx \*(Aqamatch\*(Aq;
|
|
138 \&
|
|
139 \& print if amatch("foobar");
|
|
140 \&
|
|
141 \& my @matches = amatch("xyzzy", @inputs);
|
|
142 \&
|
|
143 \& my @catches = amatch("plugh", [\*(Aq2\*(Aq], @inputs);
|
|
144 .Ve
|
|
145 .SH "DESCRIPTION"
|
|
146 .IX Header "DESCRIPTION"
|
|
147 String::Approx lets you match and substitute strings approximately.
|
|
148 With this you can emulate errors: typing errorrs, speling errors,
|
|
149 closely related vocabularies (colour color), genetic mutations (\s-1GAG\s0
|
|
150 \&\s-1ACT\s0), abbreviations (McScot, MacScot).
|
|
151 .PP
|
|
152 \&\s-1NOTE:\s0 String::Approx suits the task of \fBstring matching\fR, not
|
|
153 \&\fBstring comparison\fR, and it works for \fBstrings\fR, not for \fBtext\fR.
|
|
154 .PP
|
|
155 If you want to compare strings for similarity, you probably just want
|
|
156 the Levenshtein edit distance (explained below), the Text::Levenshtein
|
|
157 and Text::LevenshteinXS modules in \s-1CPAN\s0. See also Text::WagnerFischer
|
|
158 and Text::PhraseDistance. (There are functions for this in String::Approx,
|
|
159 e.g. \fIadist()\fR, but their results sometimes differ from the bare Levenshtein
|
|
160 et al.)
|
|
161 .PP
|
|
162 If you want to compare things like text or source code, consisting of
|
|
163 \&\fBwords\fR or \fBtokens\fR and \fBphrases\fR and \fBsentences\fR, or
|
|
164 \&\fBexpressions\fR and \fBstatements\fR, you should probably use some other
|
|
165 tool than String::Approx, like for example the standard \s-1UNIX\s0 \fIdiff\fR\|(1)
|
|
166 tool, or the Algorithm::Diff module from \s-1CPAN\s0.
|
|
167 .PP
|
|
168 The measure of \fBapproximateness\fR is the \fILevenshtein edit distance\fR.
|
|
169 It is the total number of \*(L"edits\*(R": insertions,
|
|
170 .PP
|
|
171 .Vb 1
|
|
172 \& word world
|
|
173 .Ve
|
|
174 .PP
|
|
175 deletions,
|
|
176 .PP
|
|
177 .Vb 1
|
|
178 \& monkey money
|
|
179 .Ve
|
|
180 .PP
|
|
181 and substitutions
|
|
182 .PP
|
|
183 .Vb 1
|
|
184 \& sun fun
|
|
185 .Ve
|
|
186 .PP
|
|
187 required to transform a string to another string. For example, to
|
|
188 transform \fI\*(L"lead\*(R"\fR into \fI\*(L"gold\*(R"\fR, you need three edits:
|
|
189 .PP
|
|
190 .Vb 1
|
|
191 \& lead gead goad gold
|
|
192 .Ve
|
|
193 .PP
|
|
194 The edit distance of \*(L"lead\*(R" and \*(L"gold\*(R" is therefore three, or 75%.
|
|
195 .PP
|
|
196 \&\fBString::Approx\fR uses the Levenshtein edit distance as its measure, but
|
|
197 String::Approx is not well-suited for comparing strings of different
|
|
198 length, in other words, if you want a \*(L"fuzzy eq\*(R", see above.
|
|
199 String::Approx is more like regular expressions or \fIindex()\fR, it finds
|
|
200 substrings that are close matches.>
|
|
201 .SH "MATCH"
|
|
202 .IX Header "MATCH"
|
|
203 .Vb 1
|
|
204 \& use String::Approx \*(Aqamatch\*(Aq;
|
|
205 \&
|
|
206 \& $matched = amatch("pattern")
|
|
207 \& $matched = amatch("pattern", [ modifiers ])
|
|
208 \&
|
|
209 \& $any_matched = amatch("pattern", @inputs)
|
|
210 \& $any_matched = amatch("pattern", [ modifiers ], @inputs)
|
|
211 \&
|
|
212 \& @match = amatch("pattern")
|
|
213 \& @match = amatch("pattern", [ modifiers ])
|
|
214 \&
|
|
215 \& @matches = amatch("pattern", @inputs)
|
|
216 \& @matches = amatch("pattern", [ modifiers ], @inputs)
|
|
217 .Ve
|
|
218 .PP
|
|
219 Match \fBpattern\fR approximately. In list context return the matched
|
|
220 \&\fB\f(CB@inputs\fB\fR. If no inputs are given, match against the \fB\f(CB$_\fB\fR. In scalar
|
|
221 context return true if \fIany\fR of the inputs match, false if none match.
|
|
222 .PP
|
|
223 Notice that the pattern is a string. Not a regular expression. None
|
|
224 of the regular expression notations (^, ., *, and so on) work. They
|
|
225 are characters just like the others. Note-on-note: some limited form
|
|
226 of \fI\*(L"regular expressionism\*(R"\fR is planned in future: for example
|
|
227 character classes ([abc]) and \fIany-chars\fR (.). But that feature will
|
|
228 be turned on by a special \fImodifier\fR (just a guess: \*(L"r\*(R"), so there
|
|
229 should be no backward compatibility problem.
|
|
230 .PP
|
|
231 Notice also that matching is not symmetric. The inputs are matched
|
|
232 against the pattern, not the other way round. In other words: the
|
|
233 pattern can be a substring, a submatch, of an input element. An input
|
|
234 element is always a superstring of the pattern.
|
|
235 .SS "\s-1MODIFIERS\s0"
|
|
236 .IX Subsection "MODIFIERS"
|
|
237 With the modifiers you can control the amount of approximateness and
|
|
238 certain other control variables. The modifiers are one or more
|
|
239 strings, for example \fB\*(L"i\*(R"\fR, within a string optionally separated by
|
|
240 whitespace. The modifiers are inside an anonymous array: the \fB[ ]\fR
|
|
241 in the syntax are not notational, they really do mean \fB[ ]\fR, for
|
|
242 example \fB[ \*(L"i\*(R", \*(L"2\*(R" ]\fR. \fB[\*(L"2 i\*(R"]\fR would be identical.
|
|
243 .PP
|
|
244 The implicit default approximateness is 10%, rounded up. In other
|
|
245 words: every tenth character in the pattern may be an error, an edit.
|
|
246 You can explicitly set the maximum approximateness by supplying a
|
|
247 modifier like
|
|
248 .PP
|
|
249 .Vb 2
|
|
250 \& number
|
|
251 \& number%
|
|
252 .Ve
|
|
253 .PP
|
|
254 Examples: \fB\*(L"3\*(R"\fR, \fB\*(L"15%\*(R"\fR.
|
|
255 .PP
|
|
256 Note that \f(CW\*(C`0%\*(C'\fR is not rounded up, it is equal to \f(CW0\fR.
|
|
257 .PP
|
|
258 Using a similar syntax you can separately control the maximum number
|
|
259 of insertions, deletions, and substitutions by prefixing the numbers
|
|
260 with I, D, or S, like this:
|
|
261 .PP
|
|
262 .Vb 6
|
|
263 \& Inumber
|
|
264 \& Inumber%
|
|
265 \& Dnumber
|
|
266 \& Dnumber%
|
|
267 \& Snumber
|
|
268 \& Snumber%
|
|
269 .Ve
|
|
270 .PP
|
|
271 Examples: \fB\*(L"I2\*(R"\fR, \fB\*(L"D20%\*(R"\fR, \fB\*(L"S0\*(R"\fR.
|
|
272 .PP
|
|
273 You can ignore case (\fB\*(L"A\*(R"\fR becames equal to \fB\*(L"a\*(R"\fR and vice versa)
|
|
274 by adding the \fB\*(L"i\*(R"\fR modifier.
|
|
275 .PP
|
|
276 For example
|
|
277 .PP
|
|
278 .Vb 1
|
|
279 \& [ "i 25%", "S0" ]
|
|
280 .Ve
|
|
281 .PP
|
|
282 means \fIignore case\fR, \fIallow every fourth character to be \*(L"an edit\*(R"\fR,
|
|
283 but allow \fIno substitutions\fR. (See \s-1NOTES\s0 about disallowing
|
|
284 substitutions or insertions.)
|
|
285 .PP
|
|
286 \&\s-1NOTE:\s0 setting \f(CW\*(C`I0 D0 S0\*(C'\fR is not equivalent to using \fIindex()\fR.
|
|
287 If you want to use \fIindex()\fR, use \fIindex()\fR.
|
|
288 .SH "SUBSTITUTE"
|
|
289 .IX Header "SUBSTITUTE"
|
|
290 .Vb 1
|
|
291 \& use String::Approx \*(Aqasubstitute\*(Aq;
|
|
292 \&
|
|
293 \& @substituted = asubstitute("pattern", "replacement")
|
|
294 \& @substituted = asubstitute("pattern", "replacement", @inputs)
|
|
295 \& @substituted = asubstitute("pattern", "replacement", [ modifiers ])
|
|
296 \& @substituted = asubstitute("pattern", "replacement",
|
|
297 \& [ modifiers ], @inputs)
|
|
298 .Ve
|
|
299 .PP
|
|
300 Substitute approximate \fBpattern\fR with \fBreplacement\fR and return as a
|
|
301 list <copies> of \fB\f(CB@inputs\fB\fR, the substitutions having been made on the
|
|
302 elements that did match the pattern. If no inputs are given,
|
|
303 substitute in the \fB\f(CB$_\fB\fR. The replacement can contain magic strings
|
|
304 \&\fB$&\fR, \fB$`\fR, \fB$'\fR that stand for the matched string, the string
|
|
305 before it, and the string after it, respectively. All the other
|
|
306 arguments are as in \f(CW\*(C`amatch()\*(C'\fR, plus one additional modifier, \fB\*(L"g\*(R"\fR
|
|
307 which means substitute globally (all the matches in an element and not
|
|
308 just the first one, as is the default).
|
|
309 .PP
|
|
310 See \*(L"\s-1BAD\s0 \s-1NEWS\s0\*(R" about the unfortunate stinginess of \f(CW\*(C`asubstitute()\*(C'\fR.
|
|
311 .SH "INDEX"
|
|
312 .IX Header "INDEX"
|
|
313 .Vb 1
|
|
314 \& use String::Approx \*(Aqaindex\*(Aq;
|
|
315 \&
|
|
316 \& $index = aindex("pattern")
|
|
317 \& @indices = aindex("pattern", @inputs)
|
|
318 \& $index = aindex("pattern", [ modifiers ])
|
|
319 \& @indices = aindex("pattern", [ modifiers ], @inputs)
|
|
320 .Ve
|
|
321 .PP
|
|
322 Like \f(CW\*(C`amatch()\*(C'\fR but returns the index/indices at which the pattern
|
|
323 matches approximately. In list context and if \f(CW@inputs\fR are used,
|
|
324 returns a list of indices, one index for each input element.
|
|
325 If there's no approximate match, \f(CW\*(C`\-1\*(C'\fR is returned as the index.
|
|
326 .PP
|
|
327 \&\s-1NOTE:\s0 if there is character repetition (e.g. \*(L"aa\*(R") either in
|
|
328 the pattern or in the text, the returned index might start
|
|
329 \&\*(L"too early\*(R". This is consistent with the goal of the module
|
|
330 of matching \*(L"as early as possible\*(R", just like regular expressions
|
|
331 (that there might be a \*(L"less approximate\*(R" match starting later is
|
|
332 of somewhat irrelevant).
|
|
333 .PP
|
|
334 There's also backwards-scanning \f(CW\*(C`arindex()\*(C'\fR.
|
|
335 .SH "SLICE"
|
|
336 .IX Header "SLICE"
|
|
337 .Vb 1
|
|
338 \& use String::Approx \*(Aqaslice\*(Aq;
|
|
339 \&
|
|
340 \& ($index, $size) = aslice("pattern")
|
|
341 \& ([$i0, $s0], ...) = aslice("pattern", @inputs)
|
|
342 \& ($index, $size) = aslice("pattern", [ modifiers ])
|
|
343 \& ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs)
|
|
344 .Ve
|
|
345 .PP
|
|
346 Like \f(CW\*(C`aindex()\*(C'\fR but returns also the size (length) of the match.
|
|
347 If the match fails, returns an empty list (when matching against \f(CW$_\fR)
|
|
348 or an empty anonymous list corresponding to the particular input.
|
|
349 .PP
|
|
350 \&\s-1NOTE:\s0 size of the match will very probably be something you did not
|
|
351 expect (such as longer than the pattern, or a negative number). This
|
|
352 may or may not be fixed in future releases. Also the beginning of the
|
|
353 match may vary from the expected as with \fIaindex()\fR, see above.
|
|
354 .PP
|
|
355 If the modifier
|
|
356 .PP
|
|
357 .Vb 1
|
|
358 \& "minimal_distance"
|
|
359 .Ve
|
|
360 .PP
|
|
361 is used, the minimal possible edit distance is returned as the
|
|
362 third element:
|
|
363 .PP
|
|
364 .Vb 2
|
|
365 \& ($index, $size, $distance) = aslice("pattern", [ modifiers ])
|
|
366 \& ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs)
|
|
367 .Ve
|
|
368 .SH "DISTANCE"
|
|
369 .IX Header "DISTANCE"
|
|
370 .Vb 1
|
|
371 \& use String::Approx \*(Aqadist\*(Aq;
|
|
372 \&
|
|
373 \& $dist = adist("pattern", $input);
|
|
374 \& @dist = adist("pattern", @input);
|
|
375 .Ve
|
|
376 .PP
|
|
377 Return the \fIedit distance\fR or distances between the pattern and the
|
|
378 input or inputs. Zero edit distance means exact match. (Remember
|
|
379 that the match can 'float' in the inputs, the match is a substring
|
|
380 match.) If the pattern is longer than the input or inputs, the
|
|
381 returned distance or distances is or are negative.
|
|
382 .PP
|
|
383 .Vb 1
|
|
384 \& use String::Approx \*(Aqadistr\*(Aq;
|
|
385 \&
|
|
386 \& $dist = adistr("pattern", $input);
|
|
387 \& @dist = adistr("pattern", @inputs);
|
|
388 .Ve
|
|
389 .PP
|
|
390 Return the \fBrelative\fR \fIedit distance\fR or distances between the
|
|
391 pattern and the input or inputs. Zero relative edit distance means
|
|
392 exact match, one means completely different. (Remember that the
|
|
393 match can 'float' in the inputs, the match is a substring match.) If
|
|
394 the pattern is longer than the input or inputs, the returned distance
|
|
395 or distances is or are negative.
|
|
396 .PP
|
|
397 You can use \fIadist()\fR or \fIadistr()\fR to sort the inputs according to their
|
|
398 approximateness:
|
|
399 .PP
|
|
400 .Vb 3
|
|
401 \& my %d;
|
|
402 \& @d{@inputs} = map { abs } adistr("pattern", @inputs);
|
|
403 \& my @d = sort { $d{$a} <=> $d{$b} } @inputs;
|
|
404 .Ve
|
|
405 .PP
|
|
406 Now \f(CW@d\fR contains the inputs, the most like \f(CW"pattern"\fR first.
|
|
407 .SH "CONTROLLING THE CACHE"
|
|
408 .IX Header "CONTROLLING THE CACHE"
|
|
409 \&\f(CW\*(C`String::Approx\*(C'\fR maintains a \s-1LU\s0 (least-used) cache that holds the
|
|
410 \&'matching engines' for each instance of a \fIpattern+modifiers\fR. The
|
|
411 cache is intended to help the case where you match a small set of
|
|
412 patterns against a large set of string. However, the more engines you
|
|
413 cache the more you eat memory. If you have a lot of different
|
|
414 patterns or if you have a lot of memory to burn, you may want to
|
|
415 control the cache yourself. For example, allowing a larger cache
|
|
416 consumes more memory but probably runs a little bit faster since the
|
|
417 cache fills (and needs flushing) less often.
|
|
418 .PP
|
|
419 The cache has two parameters: \fImax\fR and \fIpurge\fR. The first one
|
|
420 is the maximum size of the cache and the second one is the cache
|
|
421 flushing ratio: when the number of cache entries exceeds \fImax\fR,
|
|
422 \&\fImax\fR times \fIpurge\fR cache entries are flushed. The default
|
|
423 values are 1000 and 0.75, respectively, which means that when
|
|
424 the 1001st entry would be cached, 750 least used entries will
|
|
425 be removed from the cache. To access the parameters you can
|
|
426 use the calls
|
|
427 .PP
|
|
428 .Vb 2
|
|
429 \& $now_max = String::Approx::cache_max();
|
|
430 \& String::Approx::cache_max($new_max);
|
|
431 \&
|
|
432 \& $now_purge = String::Approx::cache_purge();
|
|
433 \& String::Approx::cache_purge($new_purge);
|
|
434 \&
|
|
435 \& $limit = String::Approx::cache_n_purge();
|
|
436 .Ve
|
|
437 .PP
|
|
438 To be honest, there are actually \fBtwo\fR caches: the first one is used
|
|
439 far the patterns with no modifiers, the second one for the patterns
|
|
440 with pattern modifiers. Using the standard parameters you will
|
|
441 therefore actually cache up to 2000 entries. The above calls control
|
|
442 both caches for the same price.
|
|
443 .PP
|
|
444 To disable caching completely use
|
|
445 .PP
|
|
446 .Vb 1
|
|
447 \& String::Approx::cache_disable();
|
|
448 .Ve
|
|
449 .PP
|
|
450 Note that this doesn't flush any possibly existing cache entries,
|
|
451 to do that use
|
|
452 .PP
|
|
453 .Vb 1
|
|
454 \& String::Approx::cache_flush_all();
|
|
455 .Ve
|
|
456 .SH "NOTES"
|
|
457 .IX Header "NOTES"
|
|
458 Because matching is by \fIsubstrings\fR, not by whole strings, insertions
|
|
459 and substitutions produce often very similar results: \*(L"abcde\*(R" matches
|
|
460 \&\*(L"axbcde\*(R" either by insertion \fBor\fR substitution of \*(L"x\*(R".
|
|
461 .PP
|
|
462 The maximum edit distance is also the maximum number of edits.
|
|
463 That is, the \fB\*(L"I2\*(R"\fR in
|
|
464 .PP
|
|
465 .Vb 1
|
|
466 \& amatch("abcd", ["I2"])
|
|
467 .Ve
|
|
468 .PP
|
|
469 is useless because the maximum edit distance is (implicitly) 1.
|
|
470 You may have meant to say
|
|
471 .PP
|
|
472 .Vb 1
|
|
473 \& amatch("abcd", ["2D1S1"])
|
|
474 .Ve
|
|
475 .PP
|
|
476 or something like that.
|
|
477 .PP
|
|
478 If you want to simulate transposes
|
|
479 .PP
|
|
480 .Vb 1
|
|
481 \& feet fete
|
|
482 .Ve
|
|
483 .PP
|
|
484 you need to allow at least edit distance of two because in terms of
|
|
485 our edit primitives a transpose is first one deletion and then one
|
|
486 insertion.
|
|
487 .SS "\s-1TEXT\s0 \s-1POSITION\s0"
|
|
488 .IX Subsection "TEXT POSITION"
|
|
489 The starting and ending positions of matching, substituting, indexing, or
|
|
490 slicing can be changed from the beginning and end of the input(s) to
|
|
491 some other positions by using either or both of the modifiers
|
|
492 .PP
|
|
493 .Vb 2
|
|
494 \& "initial_position=24"
|
|
495 \& "final_position=42"
|
|
496 .Ve
|
|
497 .PP
|
|
498 or the both the modifiers
|
|
499 .PP
|
|
500 .Vb 2
|
|
501 \& "initial_position=24"
|
|
502 \& "position_range=10"
|
|
503 .Ve
|
|
504 .PP
|
|
505 By setting the \fB\*(L"position_range\*(R"\fR to be zero you can limit
|
|
506 (anchor) the operation to happen only once (if a match is possible)
|
|
507 at the position.
|
|
508 .SH "VERSION"
|
|
509 .IX Header "VERSION"
|
|
510 Major release 3.
|
|
511 .SH "CHANGES FROM VERSION 2"
|
|
512 .IX Header "CHANGES FROM VERSION 2"
|
|
513 .SS "\s-1GOOD\s0 \s-1NEWS\s0"
|
|
514 .IX Subsection "GOOD NEWS"
|
|
515 .IP "The version 3 is 2\-3 times faster than version 2" 4
|
|
516 .IX Item "The version 3 is 2-3 times faster than version 2"
|
|
517 .PD 0
|
|
518 .IP "No pattern length limitation" 4
|
|
519 .IX Item "No pattern length limitation"
|
|
520 .PD
|
|
521 The algorithm is independent on the pattern length: its time
|
|
522 complexity is \fIO(kn)\fR, where \fIk\fR is the number of edits and \fIn\fR the
|
|
523 length of the text (input). The preprocessing of the pattern will of
|
|
524 course take some \fIO(m)\fR (\fIm\fR being the pattern length) time, but
|
|
525 \&\f(CW\*(C`amatch()\*(C'\fR and \f(CW\*(C`asubstitute()\*(C'\fR cache the result of this
|
|
526 preprocessing so that it is done only once per pattern.
|
|
527 .SS "\s-1BAD\s0 \s-1NEWS\s0"
|
|
528 .IX Subsection "BAD NEWS"
|
|
529 .IP "You do need a C compiler to install the module" 4
|
|
530 .IX Item "You do need a C compiler to install the module"
|
|
531 Perl's regular expressions are no more used; instead a faster and more
|
|
532 scalable algorithm written in C is used.
|
|
533 .ie n .IP """asubstitute()"" is now always stingy" 4
|
|
534 .el .IP "\f(CWasubstitute()\fR is now always stingy" 4
|
|
535 .IX Item "asubstitute() is now always stingy"
|
|
536 The string matched and substituted is now always stingy, as short
|
|
537 as possible. It used to be as long as possible. This is an unfortunate
|
|
538 change stemming from switching the matching algorithm. Example: with
|
|
539 edit distance of two and substituting for \fB\*(L"word\*(R"\fR from \fB\*(L"cork\*(R"\fR and
|
|
540 \&\fB\*(L"wool\*(R"\fR previously did match \fB\*(L"cork\*(R"\fR and \fB\*(L"wool\*(R"\fR. Now it does
|
|
541 match \fB\*(L"or\*(R"\fR and \fB\*(L"wo\*(R"\fR. As little as possible, or, in other words,
|
|
542 with as much approximateness, as many edits, as possible. Because
|
|
543 there is no \fIneed\fR to match the \fB\*(L"c\*(R"\fR of \fB\*(L"cork\*(R"\fR, it is not matched.
|
|
544 .ie n .IP "no more ""aregex()"" because regular expressions are no more used" 4
|
|
545 .el .IP "no more \f(CWaregex()\fR because regular expressions are no more used" 4
|
|
546 .IX Item "no more aregex() because regular expressions are no more used"
|
|
547 .PD 0
|
|
548 .ie n .IP "no more ""compat1"" for String::Approx version 1 compatibility" 4
|
|
549 .el .IP "no more \f(CWcompat1\fR for String::Approx version 1 compatibility" 4
|
|
550 .IX Item "no more compat1 for String::Approx version 1 compatibility"
|
|
551 .PD
|
|
552 .SH "ACKNOWLEDGEMENTS"
|
|
553 .IX Header "ACKNOWLEDGEMENTS"
|
|
554 The following people have provided valuable test cases, documentation
|
|
555 clarifications, and other feedback:
|
|
556 .PP
|
|
557 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz,
|
|
558 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda,
|
|
559 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner,
|
|
560 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton,
|
|
561 \&'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley,
|
|
562 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman,
|
|
563 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park,
|
|
564 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic,
|
|
565 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin,
|
|
566 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith,
|
|
567 Frank Tobin, Greg Ward, Rich Williams, Rick Wise.
|
|
568 .PP
|
|
569 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra
|
|
570 Gopal in the Department of Computer Science, University of Arizona.
|
|
571 .SH "AUTHOR"
|
|
572 .IX Header "AUTHOR"
|
|
573 Jarkko Hietaniemi <jhi@iki.fi>
|
|
574 .SH "COPYRIGHT AND LICENSE"
|
|
575 .IX Header "COPYRIGHT AND LICENSE"
|
|
576 Copyright 2001\-2013 by Jarkko Hietaniemi
|
|
577 .PP
|
|
578 This library is free software; you can redistribute it and/or modify
|
|
579 under either the terms of the Artistic License 2.0, or the \s-1GNU\s0 Library
|
|
580 General Public License, Version 2. See the files Artistic and \s-1LGPL\s0
|
|
581 for more details.
|
|
582 .PP
|
|
583 Furthermore: no warranties or obligations of any kind are given, and
|
|
584 the separate file \fI\s-1COPYRIGHT\s0\fR must be included intact in all copies
|
|
585 and derived materials.
|