From 9ee93076cf04a466d2e9620bc3efe538d93e5983 Mon Sep 17 00:00:00 2001
From: chris <chris@jalakai.co.uk>
Date: Thu, 31 Aug 2006 02:34:13 +0200
Subject: [PATCH] search improvements

ft_snippet()
- make utf8 algorithm default
- add workaround for utf8_substr() limitations, bug #891
- fix some indexes which missed out on conversion to utf8
  character counts
- minor improvements

idx_lookup()
- minor changes to wildcard matching code to improve performance
  (changes based on profiling results)

utf8
- explicitly set mb_internal_encoding() to UTF-8 when the mbstring
  functions will be used.

darcs-hash:20060831003413-9b6ab-712021eda3c959ffe79d8d3fe91d2c9a8acf2b58.gz
---
 inc/fulltext.php | 79 ++++++++++++++++++++++++++++++------------------
 inc/indexer.php  |  9 ++++--
 inc/utf8.php     |  1 +
 3 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/inc/fulltext.php b/inc/fulltext.php
index b9450c172..fa3ec05d2 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -255,7 +255,6 @@ switch ($algorithm) {
   break;
 
   case 'opt2' :
-  default :
 // option 2 ... CS 2006-08-25
 // above + reduce amount of the file searched
     $match = array();
@@ -311,15 +310,22 @@ switch ($algorithm) {
   break;
   
   case 'utf8':
+  default :
+
     $match = array();
     $snippets = array();
-    $utf8_offset = $offset = 0;
+    $utf8_offset = $offset = $end = 0;
     $len = utf8_strlen($text);
+
     for ($cnt=3; $cnt--;) {
       if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
 
       list($str,$idx) = $match[0];
       
+      // is it ok to use utf8_substr() -- see bug #891,
+      //   check idx against (2^16)-1 - 400 (100x4 byte utf-8 characters)
+      if ($idx <= 65135) {
+
       // convert $idx (a byte offset) into a utf8 character offset
       $utf8_idx = utf8_strlen(substr($text,0,$idx));
       $utf8_len = utf8_strlen($str);
@@ -328,39 +334,54 @@ switch ($algorithm) {
       // first look to see if we can go 100 either side,
       // then drop to 50 adding any excess if the other side can't go to 50,
       // NOTE: these are byte adjustments and will have to be corrected for utf-8
-      $pre = min($utf8_idx-$utf8_offset,100);
-      $post = min($len-$utf8_idx-$utf8_len,100);
+        $pre = min($utf8_idx-$utf8_offset,100);
+        $post = min($len-$utf8_idx-$utf8_len,100);
+
+        if ($pre>50 && $post>50) {
+          $pre = $post = 50;
+        } else if ($pre>50) {
+          $pre = min($pre,100-$post);
+        } else if ($post>50) {
+          $post = min($post, 100-$pre);
+        } else {
+          // both are less than 50, means the context is the whole string
+          // make it so and break out of this loop - there is no need for the complex snippet calculations
+          $snippets = array($text);
+          break;
+        }
 
-      if ($pre>50 && $post>50) {
-        $pre = $post = 50;
-      } else if ($pre>50) {
-        $pre = min($pre,100-$post);
-      } else if ($post>50) {
-        $post = min($post, 100-$pre);
-      } else {
-        // both are less than 50, means the context is the whole string
-        // make it so and break out of this loop - there is no need for the complex snippet calculations
-        $snippets = array($text);
-        break;
-      }
+        // establish context start and end points, try to append to previous context if possible
+        $start = $utf8_idx - $pre;
+        $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
+        $end = $utf8_idx + $utf8_len + $post;           // now set it to the end of this context
 
-      // establish context start and end points, try to append to previous context if possible
-      $start = $idx - $pre;
-      $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
-      $end = $idx + $utf8_len + $post;                // now set it to the end of this context
+        if ($append) {
+          $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+        } else {
+          $snippets[] = utf8_substr($text,$start,$end-$start);
+        }
 
-      if ($append) {
-        $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+        // set $offset for next match attempt
+        //   subtract strlen to avoid splitting a potential search success, this is an approximation as the
+        //   search pattern may match strings of varying length and it will fail if the context snippet
+        //   boundary breaks a matching string longer than the current match
+        $utf8_offset = $utf8_idx + $post;
+        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
+        $offset = utf8_correctIdx($text,$offset);
       } else {
-        $snippets[] = utf8_substr($text,$start,$end-$start);
+        // code for strings too large for utf8_substr
+        // use a larger context number as it counts bytes, not characters
+        $pre = 70;
+        $post = min(strlen($text)-$idx-strlen($str), 70);
+        if ($post < 70) { $pre = 70 - $post; }
+
+        $start = utf8_correctIdx($text,$idx - $pre);
+        $end = utf8_correctIdx($text, $idx + strlen($str) + $post);
+
+        $snippets[] = substr($text,$start,$end-$start);
+        $offset = $end - strlen($str);
       }
 
-      // set $offset for next match attempt
-      //   substract strlen to avoid splitting a potential search success, this is an approximation as the
-      //   search pattern may match strings of varying length and it will fail if the context snippet
-      //   boundary breaks a matching string longer than the current match
-      $utf8_offset = $end - $utf8_len;
-      $offset = utf8_correctIdx($text,strlen(substr($text,0,$utf8_offset)));
     }
     $m = "\1";
     $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
diff --git a/inc/indexer.php b/inc/indexer.php
index 9af4b5b84..a2b7a0637 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -233,6 +233,8 @@ function idx_lookup($words){
         if(substr($xword,0,1) == '*'){
             $xword = substr($xword,1);
             $wild  = 1;
+            $ptn = '/'.preg_quote($xword,'/').'$/';
+#            $l = -1*strlen($xword)-1;
         }
         if(substr($xword,-1,1) == '*'){
             $xword = substr($xword,0,-1);
@@ -245,8 +247,11 @@ function idx_lookup($words){
             for($wid=0; $wid<$cnt; $wid++){
                 $iword = $word_idx[$wid];
                 if( (($wild==3) && is_int(strpos($iword,$xword))) ||
-                    (($wild==1) && ("$xword\n" == substr($iword,(-1*strlen($xword))-1))) ||
-                    (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+#                    (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
+                    (($wild==1) && preg_match($ptn,$iword)) ||
+#                    (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+                    (($wild==2) && (0 === strpos($iword,$xword)))
+
                   ){
                     $wids[] = $wid;
                     $result[$word][] = $wid;
diff --git a/inc/utf8.php b/inc/utf8.php
index aa9594c42..dbf09b6fc 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -13,6 +13,7 @@
 if(!defined('UTF8_MBSTRING')){
   if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
     define('UTF8_MBSTRING',1);
+    mb_internal_encoding('UTF-8');
   }else{
     define('UTF8_MBSTRING',0);
   }
-- 
GitLab