Skip to content
Snippets Groups Projects
Commit 2626ee0c authored by chris's avatar chris
Browse files

more utf8_substr improvements (re FS#891 and yesterday's patch)

- rework utf8_substr() NOMBSTRING code to always use pcre
- remove work around for utf8_substr() and large strings from ft_snippet()

darcs-hash:20060928165122-9b6ab-0eefc216f07f9d7e7d8eb62ce26605c28ee340fa.gz
parent d07dd8ee
No related branches found
No related tags found
No related merge requests found
......@@ -328,10 +328,6 @@ switch ($algorithm) {
list($str,$idx) = $match[0];
// is it ok to use utf8_substr() -- see bug #891,
// check idx against (2^16)-1 - 400 (100x4 byte utf-8 characters)
if ($idx <= 65135) {
// convert $idx (a byte offset) into a utf8 character offset
$utf8_idx = utf8_strlen(substr($text,0,$idx));
$utf8_len = utf8_strlen($str);
......@@ -339,55 +335,42 @@ switch ($algorithm) {
// establish context, 100 bytes surrounding the match string
// first look to see if we can go 100 either side,
// then drop to 50 adding any excess if the other side can't go to 50,
$pre = min($utf8_idx-$utf8_offset,100);
$post = min($len-$utf8_idx-$utf8_len,100);
if ($pre>50 && $post>50) {
$pre = $post = 50;
} else if ($pre>50) {
$pre = min($pre,100-$post);
} else if ($post>50) {
$post = min($post, 100-$pre);
} else {
// both are less than 50, means the context is the whole string
// make it so and break out of this loop - there is no need for the complex snippet calculations
$snippets = array($text);
break;
}
// establish context start and end points, try to append to previous context if possible
$start = $utf8_idx - $pre;
$append = ($start < $end) ? $end : false; // still the end of the previous context snippet
$end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context
$pre = min($utf8_idx-$utf8_offset,100);
$post = min($len-$utf8_idx-$utf8_len,100);
if ($append) {
$snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
} else {
$snippets[] = utf8_substr($text,$start,$end-$start);
}
// set $offset for next match attempt
// subtract strlen to avoid splitting a potential search success, this is an approximation as the
// search pattern may match strings of varying length and it will fail if the context snippet
// boundary breaks a matching string longer than the current match
$utf8_offset = $utf8_idx + $post;
$offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
$offset = utf8_correctIdx($text,$offset);
if ($pre>50 && $post>50) {
$pre = $post = 50;
} else if ($pre>50) {
$pre = min($pre,100-$post);
} else if ($post>50) {
$post = min($post, 100-$pre);
} else {
// code for strings too large for utf8_substr
// use a larger context number as its bytes not characters
// no need to check for short pre, $idx is nearly 64k
$post = min(strlen($text)-$idx-strlen($str), 70);
$pre = ($post < 70) ? 140 - $post : 70;
// both are less than 50, means the context is the whole string
// make it so and break out of this loop - there is no need for the complex snippet calculations
$snippets = array($text);
break;
}
$start = utf8_correctIdx($text,$idx - $pre);
$end = utf8_correctIdx($text, $idx + strlen($str) + $post);
// establish context start and end points, try to append to previous context if possible
$start = $utf8_idx - $pre;
$append = ($start < $end) ? $end : false; // still the end of the previous context snippet
$end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context
$snippets[] = substr($text,$start,$end-$start);
$offset = $end - strlen($str);
if ($append) {
$snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
} else {
$snippets[] = utf8_substr($text,$start,$end-$start);
}
// set $offset for next match attempt
// subtract strlen to avoid splitting a potential search success, this is an approximation as the
// search pattern may match strings of varying length and it will fail if the context snippet
// boundary breaks a matching string longer than the current match
$utf8_offset = $utf8_idx + $post;
$offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
$offset = utf8_correctIdx($text,$offset);
}
$m = "\1";
$snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
$snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));
......
......@@ -127,8 +127,6 @@ function utf8_strlen($string){
* UTF-8 aware alternative to substr
*
* Return part of a string given character offset (and optionally length)
* Note: supports use of negative offsets and lengths but will be slower
* when doing so
*
* @author Harry Fuecks <hfuecks@gmail.com>
* @author Chris Smith <chris@jalakai.co.uk>
......@@ -146,61 +144,86 @@ function utf8_substr($str, $offset, $length = null) {
}
}
if ( $offset >= 0 && $length >= 0 && $offset < 65534 && $length < 65534) {
if ( $length === null ) {
$length = '*';
} else {
$strlen = strlen(utf8_decode($str));
if ( $offset > $strlen ) {
return '';
}
if ( ( $offset + $length ) > $strlen ) {
$length = '*';
} else {
$length = '{'.$length.'}';
}
}
/*
* Notes:
*
* no mb string support, so we'll use pcre regex's with 'u' flag
* pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
* offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
*
* substr documentation states false can be returned in some cases (e.g. offset > string length)
* mb_substr never returns false, it will return an empty string instead.
*
* calculating the number of characters in the string is a relatively expensive operation, so
* we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
*/
// cast parameters to appropriate types to avoid multiple notices/warnings
$str = (string)$str; // generates E_NOTICE for PHP4 objects, but not PHP5 objects
$offset = (int)$offset;
if (!is_null($length)) $length = (int)$length;
// handle trivial cases
if ($length === 0) return '';
if ($offset < 0 && $length < 0 && $length < $offset) return '';
$offset_pattern = '';
$length_pattern = '';
// normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
if ($offset < 0) {
$strlen = strlen(utf8_decode($str)); // see notes
$offset = $strlen + $offset;
if ($offset < 0) $offset = 0;
}
$pattern = '/^.{'.$offset.'}(.'.$length.')/us';
preg_match($pattern, $str, $matches);
// establish a pattern for offset, a non-captured group equal in length to offset
if ($offset > 0) {
$Ox = (int)($offset/65535);
$Oy = $offset%65535;
if ( isset($matches[1]) ) {
return $matches[1];
}
return false;
if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
$offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
} else {
$offset_pattern = '^'; // offset == 0; just anchor the pattern
}
// establish a pattern for length
if (is_null($length)) {
$length_pattern = '(.*)$'; // the rest of the string
} else {
// convert character offsets to byte offsets and use normal substr()
// 1. normalise parameters into positive offset and length and carry out simple checks
$strlen = strlen(utf8_decode($str));
if (!isset($strlen)) $strlen = strlen(utf8_decode($str)); // see notes
if ($offset > $strlen) return ''; // another trivial case
if ($offset < 0) {
$offset = max($strlen+$offset,0);
}
if ($offset >= $strlen) return false;
if ($length > 0) {
if ($length === null) {
// 2a. convert to start byte offset
list($start) = _utf8_byteindex($str,$offset);
return substr($str,$start);
}
$length = min($strlen-$offset, $length); // reduce any length that would go passed the end of the string
if ($length < 0) {
$length = $strlen-$offset+$length;
if ($length < 0) return '';
}
$Lx = (int)($length/65535);
$Ly = $length%65535;
// +ve length requires ... a captured group of length characters
if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
$length_pattern = '('.$length_pattern.'.{'.$Ly.'})';
} else if ($length < 0) {
if ($length === 0) return '';
if ($strlen - $offset < $length) $length = $strlen-$offset;
if ($length < ($offset - $strlen)) return '';
// 2b. convert to start and end byte offsets
list($start,$end) = _utf8_byteindex($str,$offset,$offset+$length);
return substr($str,$start,$end-$start);
$Lx = (int)((-$length)/65535);
$Ly = (-$length)%65535;
// -ve length requires ... capture everything except a group of -length characters
// anchored at the tail-end of the string
if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
$length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
}
}
}
if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
return $match[1];
}
/**
* Unicode aware replacement for substr_replace()
......@@ -815,69 +838,6 @@ function utf8_correctIdx(&$str,$i,$next=false) {
return $i;
}
/**
 * Determine the byte indexes into a utf-8 string for one or more character offsets
 * PRIVATE (could be made public with proper parameter checking)
 *
 * The first argument is the string, every further argument is a character
 * offset. The offsets are sorted before processing, so the returned byte
 * indexes are in ascending offset order, not necessarily in argument order.
 *
 * The function works by estimating a bytes-per-character ratio from the start
 * of the string, jumping to the estimated byte position, measuring how far off
 * it landed and repeating until it is within 7 characters of the target; the
 * remaining distance is then walked one character at a time.
 *
 * @author Chris Smith <chris@jalakai.co.uk>
 *
 * @param string $str     utf8 string
 * @param int    $offset  any number of character offsets into $str
 *
 * @return array|false    byte indexes into $str, one index for each offset argument;
 *                        false if the first argument is not a string
 */
function _utf8_byteindex() {

    $args = func_get_args();
    // plain assignment: binding the result of a function call by reference
    // raises "Only variables should be assigned by reference"
    $str = array_shift($args);
    if (!is_string($str)) return false;

    $result = array();

    // use a short piece of str to estimate bytes per character
    $i = utf8_correctIdx($str, 300, true);   // $i (& $j) -> byte indexes into $str
    $c = utf8_strlen(substr($str,0,$i));     // $c -> character offset into $str

    sort($args);                             // deal with arguments from lowest to highest

    foreach ($args as $offset) {
        // sanity checks FIXME

        // 0 is an easy check
        if ($offset == 0) { $result[] = 0; continue; }

        $safety_valve = 50;                  // ensure no endless looping

        do {
            // apply latest bytes/character estimate to offset;
            // with no characters counted yet ($c == 0, e.g. an empty string) there is
            // no ratio to apply, so fall back to byte index 0 instead of dividing by zero
            $j = ($c != 0) ? (int)($offset * $i/$c) : 0;
            $j = utf8_correctIdx($str, $j, true);   // correct to utf8 character boundary

            if ($j > $i) {
                $c += utf8_strlen(substr($str,$i,$j-$i));   // determine new character offset
            } else {
                $c -= utf8_strlen(substr($str,$j,$i-$j));   // ditto
            }

            $error = abs($c-$offset);

            $i = $j;                         // ready for next time around
        } while (($error > 7) && --$safety_valve);   // from 7 it is faster to iterate over the string

        if ($error && $error <= 7) {

            if ($c < $offset) {
                // move up
                while ($error--) { $i = utf8_correctIdx($str,++$i,true); }
            } else {
                // move down
                while ($error--) { $i = utf8_correctIdx($str,--$i,false); }
            }

            $c = $offset;                    // ready for next arg
        }
        $result[] = $i;
    }

    return $result;
}
// only needed if no mb_string available
if(!UTF8_MBSTRING){
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment