From 9ee93076cf04a466d2e9620bc3efe538d93e5983 Mon Sep 17 00:00:00 2001 From: chris <chris@jalakai.co.uk> Date: Thu, 31 Aug 2006 02:34:13 +0200 Subject: [PATCH] search improvements ft_snippet() - make utf8 algorithm default - add workaround for utf8_substr() limitations, bug #891 - fix some indexes which missed out on conversion to utf8 character counts - minor improvements idx_lookup() - minor changes to wildcard matching code to improve performance (changes based on profiling results) utf8 - specifically set mb_internal_encoding to utf-8 when mbstring functions will be used. darcs-hash:20060831003413-9b6ab-712021eda3c959ffe79d8d3fe91d2c9a8acf2b58.gz --- inc/fulltext.php | 79 ++++++++++++++++++++++++++++++------------------ inc/indexer.php | 9 ++++-- inc/utf8.php | 1 + 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/inc/fulltext.php b/inc/fulltext.php index b9450c172..fa3ec05d2 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -255,7 +255,6 @@ switch ($algorithm) { break; case 'opt2' : - default : // option 2 ... 
CS 2006-08-25 // above + reduce amount of the file searched $match = array(); @@ -311,15 +310,22 @@ switch ($algorithm) { break; case 'utf8': + default : + $match = array(); $snippets = array(); - $utf8_offset = $offset = 0; + $utf8_offset = $offset = $end = 0; $len = utf8_strlen($text); + for ($cnt=3; $cnt--;) { if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break; list($str,$idx) = $match[0]; + // is it ok to use utf8_substr() -- see bug #891, + // check idx against (2^16)-1 - 400 (100x4 byte utf-8 characters) + if ($idx <= 65135) { + // convert $idx (a byte offset) into a utf8 character offset $utf8_idx = utf8_strlen(substr($text,0,$idx)); $utf8_len = utf8_strlen($str); @@ -328,39 +334,54 @@ switch ($algorithm) { // first look to see if we can go 100 either side, // then drop to 50 adding any excess if the other side can't go to 50, // NOTE: these are byte adjustments and will have to be corrected for utf-8 - $pre = min($utf8_idx-$utf8_offset,100); - $post = min($len-$utf8_idx-$utf8_len,100); + $pre = min($utf8_idx-$utf8_offset,100); + $post = min($len-$utf8_idx-$utf8_len,100); + + if ($pre>50 && $post>50) { + $pre = $post = 50; + } else if ($pre>50) { + $pre = min($pre,100-$post); + } else if ($post>50) { + $post = min($post, 100-$pre); + } else { + // both are less than 50, means the context is the whole string + // make it so and break out of this loop - there is no need for the complex snippet calculations + $snippets = array($text); + break; + } - if ($pre>50 && $post>50) { - $pre = $post = 50; - } else if ($pre>50) { - $pre = min($pre,100-$post); - } else if ($post>50) { - $post = min($post, 100-$pre); - } else { - // both are less than 50, means the context is the whole string - // make it so and break out of this loop - there is no need for the complex snippet calculations - $snippets = array($text); - break; - } + // establish context start and end points, try to append to previous context if possible + $start = $utf8_idx - 
$pre; + $append = ($start < $end) ? $end : false; // still the end of the previous context snippet + $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context - // establish context start and end points, try to append to previous context if possible - $start = $idx - $pre; - $append = ($start < $end) ? $end : false; // still the end of the previous context snippet - $end = $idx + $utf8_len + $post; // now set it to the end of this context + if ($append) { + $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); + } else { + $snippets[] = utf8_substr($text,$start,$end-$start); + } - if ($append) { - $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); + // set $offset for next match attempt + // substract strlen to avoid splitting a potential search success, this is an approximation as the + // search pattern may match strings of varying length and it will fail if the context snippet + // boundary breaks a matching string longer than the current match + $utf8_offset = $utf8_idx + $post; + $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post)); + $offset = utf8_correctIdx($text,$offset); } else { - $snippets[] = utf8_substr($text,$start,$end-$start); + // code for strings too large for utf8_substr + // use a larger context number as its bytes not characters + $pre = 70; + $post = min(strlen($text)-$idx-strlen($str), 70); + if ($post < 70) { $pre = 70 - $post; } + + $start = utf8_correctIdx($text,$idx - $pre); + $end = utf8_correctIdx($text, $idx + strlen($str) + $post); + + $snippets[] = substr($text,$start,$end-$start); + $offset = $end - strlen($str); } - // set $offset for next match attempt - // substract strlen to avoid splitting a potential search success, this is an approximation as the - // search pattern may match strings of varying length and it will fail if the context snippet - // boundary breaks a matching string longer than the current match - $utf8_offset = $end - $utf8_len; - $offset = 
utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset))); } $m = "\1"; $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets); diff --git a/inc/indexer.php b/inc/indexer.php index 9af4b5b84..a2b7a0637 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -233,6 +233,8 @@ function idx_lookup($words){ if(substr($xword,0,1) == '*'){ $xword = substr($xword,1); $wild = 1; + $ptn = '/'.preg_quote($xword,'/').'$/'; +# $l = -1*strlen($xword)-1; } if(substr($xword,-1,1) == '*'){ $xword = substr($xword,0,-1); @@ -245,8 +247,11 @@ function idx_lookup($words){ for($wid=0; $wid<$cnt; $wid++){ $iword = $word_idx[$wid]; if( (($wild==3) && is_int(strpos($iword,$xword))) || - (($wild==1) && ("$xword\n" == substr($iword,(-1*strlen($xword))-1))) || - (($wild==2) && ($xword == substr($iword,0,strlen($xword)))) +# (($wild==1) && ("$xword\n" == substr($iword,$l))) || + (($wild==1) && preg_match($ptn,$iword)) || +# (($wild==2) && ($xword == substr($iword,0,strlen($xword)))) + (($wild==2) && (0 === strpos($iword,$xword))) + ){ $wids[] = $wid; $result[$word][] = $wid; diff --git a/inc/utf8.php b/inc/utf8.php index aa9594c42..dbf09b6fc 100644 --- a/inc/utf8.php +++ b/inc/utf8.php @@ -13,6 +13,7 @@ if(!defined('UTF8_MBSTRING')){ if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){ define('UTF8_MBSTRING',1); + mb_internal_encoding('UTF-8'); }else{ define('UTF8_MBSTRING',0); } -- GitLab