more utf8_substr improvements (re FS#891 and yesterday's patch)

- rework utf8_substr() NOMBSTRING code to always use pcre
- remove workaround for utf8_substr() and large strings from ft_snippet()

darcs-hash:20060928165122-9b6ab-0eefc216f07f9d7e7d8eb62ce26605c28ee340fa.gz

more utf8_substr improvements (re FS#891 and yesterday's patch)
2626ee0c · chris · d07dd8ee · 2626ee0c · 2626ee0c
Commit 2626ee0c authored 18 years ago by chris
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -328,10 +328,6 @@ switch ($algorithm) {

      list($str,$idx) = $match[0];
      
-      // is it ok to use utf8_substr() -- see bug #891,
-      //   check idx against (2^16)-1 - 400 (100x4 byte utf-8 characters)
-      if ($idx <= 65135) {
-
      // convert $idx (a byte offset) into a utf8 character offset
      $utf8_idx = utf8_strlen(substr($text,0,$idx));
      $utf8_len = utf8_strlen($str);
@@ -339,55 +335,42 @@ switch ($algorithm) {
      // establish context, 100 bytes surrounding the match string
      // first look to see if we can go 100 either side,
      // then drop to 50 adding any excess if the other side can't go to 50,
-        $pre = min($utf8_idx-$utf8_offset,100);
-        $post = min($len-$utf8_idx-$utf8_len,100);
-
-        if ($pre>50 && $post>50) {
-          $pre = $post = 50;
-        } else if ($pre>50) {
-          $pre = min($pre,100-$post);
-        } else if ($post>50) {
-          $post = min($post, 100-$pre);
-        } else {
-          // both are less than 50, means the context is the whole string
-          // make it so and break out of this loop - there is no need for the complex snippet calculations
-          $snippets = array($text);
-          break;
-        }
-
-        // establish context start and end points, try to append to previous context if possible
-        $start = $utf8_idx - $pre;
-        $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
-        $end = $utf8_idx + $utf8_len + $post;           // now set it to the end of this context
+      $pre = min($utf8_idx-$utf8_offset,100);
+      $post = min($len-$utf8_idx-$utf8_len,100);

-        if ($append) {
-          $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
-        } else {
-          $snippets[] = utf8_substr($text,$start,$end-$start);
-        }
-
-        // set $offset for next match attempt
-        //   substract strlen to avoid splitting a potential search success, this is an approximation as the
-        //   search pattern may match strings of varying length and it will fail if the context snippet
-        //   boundary breaks a matching string longer than the current match
-        $utf8_offset = $utf8_idx + $post;
-        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
-        $offset = utf8_correctIdx($text,$offset);
+      if ($pre>50 && $post>50) {
+        $pre = $post = 50;
+      } else if ($pre>50) {
+        $pre = min($pre,100-$post);
+      } else if ($post>50) {
+        $post = min($post, 100-$pre);
      } else {
-        // code for strings too large for utf8_substr
-        // use a larger context number as its bytes not characters
-        // no need to check for short pre, $idx is nearly 64k
-        $post = min(strlen($text)-$idx-strlen($str), 70);
-        $pre = ($post < 70) ?  140 - $post : 70;
+        // both are less than 50, means the context is the whole string
+        // make it so and break out of this loop - there is no need for the complex snippet calculations
+        $snippets = array($text);
+        break;
+      }

-        $start = utf8_correctIdx($text,$idx - $pre);
-        $end = utf8_correctIdx($text, $idx + strlen($str) + $post);
+      // establish context start and end points, try to append to previous context if possible
+      $start = $utf8_idx - $pre;
+      $append = ($start < $end) ? $end : false;       // still the end of the previous context snippet
+      $end = $utf8_idx + $utf8_len + $post;           // now set it to the end of this context

-        $snippets[] = substr($text,$start,$end-$start);
-        $offset = $end - strlen($str);
+      if ($append) {
+        $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
+      } else {
+        $snippets[] = utf8_substr($text,$start,$end-$start);
      }

+      // set $offset for next match attempt
+      //   subtract strlen to avoid splitting a potential search success, this is an approximation as the
+      //   search pattern may match strings of varying length and it will fail if the context snippet
+      //   boundary breaks a matching string longer than the current match
+      $utf8_offset = $utf8_idx + $post;
+      $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
+      $offset = utf8_correctIdx($text,$offset);
    }
+
    $m = "\1";
    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
    $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<span class="search_hit">$1</span>',hsc(join('... ',$snippets)));

--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -127,8 +127,6 @@ function utf8_strlen($string){
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
- * Note: supports use of negative offsets and lengths but will be slower
- * when doing so
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
@@ -146,61 +144,86 @@ function utf8_substr($str, $offset, $length = null) {
        }
    }

-    if ( $offset >= 0 && $length >= 0 && $offset < 65534 && $length < 65534) {
-        if ( $length === null ) {
-            $length = '*';
-        } else {
-            $strlen = strlen(utf8_decode($str));
-            if ( $offset > $strlen ) {
-                return '';
-            }
-
-            if ( ( $offset + $length ) > $strlen ) {
-               $length = '*';
-            } else {
-                $length = '{'.$length.'}';
-            }
-        }
+    /*
+     * Notes:
+     *
+     * no mbstring support, so we'll use pcre regexes with the 'u' flag.
+     * pcre only supports repetitions of less than 65536; in order to accept up to MAXINT values for
+     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
+     *
+     * substr documentation states false can be returned in some cases (e.g. offset > string length)
+     * mb_substr never returns false, it will return an empty string instead.
+     *
+     * calculating the number of characters in the string is a relatively expensive operation, so
+     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
+     */
+
+    // cast parameters to appropriate types to avoid multiple notices/warnings
+    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
+    $offset = (int)$offset;
+    if (!is_null($length)) $length = (int)$length;
+
+    // handle trivial cases
+    if ($length === 0) return '';
+    if ($offset < 0 && $length < 0 && $length < $offset) return '';
+
+    $offset_pattern = '';
+    $length_pattern = '';
+
+    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
+    if ($offset < 0) {
+      $strlen = strlen(utf8_decode($str));        // see notes
+      $offset = $strlen + $offset;
+      if ($offset < 0) $offset = 0;
+    }

-        $pattern = '/^.{'.$offset.'}(.'.$length.')/us';
-        preg_match($pattern, $str, $matches);
+    // establish a pattern for offset, a non-captured group equal in length to offset
+    if ($offset > 0) {
+      $Ox = (int)($offset/65535);
+      $Oy = $offset%65535;

-        if ( isset($matches[1]) ) {
-            return $matches[1];
-        }
-        return false;
+      if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
+      $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
+    } else {
+      $offset_pattern = '^';                      // offset == 0; just anchor the pattern
+    }

+    // establish a pattern for length
+    if (is_null($length)) {
+      $length_pattern = '(.*)$';                  // the rest of the string
    } else {

-      // convert character offsets to byte offsets and use normal substr()
-      // 1. normalise paramters into positive offset and length and carry out simple checks
-      $strlen = strlen(utf8_decode($str));
+      if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
+      if ($offset > $strlen) return '';           // another trivial case

-      if ($offset < 0) {
-        $offset = max($strlen+$offset,0);
-      }
-      if ($offset >= $strlen) return false;
+      if ($length > 0) {

-      if ($length === null) {
-        // 2a. convert to start byte offset
-        list($start) = _utf8_byteindex($str,$offset);
-				return substr($str,$start);
-      }
+        $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

-      if ($length < 0) {
-        $length = $strlen-$offset+$length;
-        if ($length < 0) return '';
-      }
+        $Lx = (int)($length/65535);
+        $Ly = $length%65535;
+
+        // +ve length requires ... a captured group of length characters
+        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
+        $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';
+
+      } else if ($length < 0) {

-      if ($length === 0) return '';
-      if ($strlen - $offset < $length) $length = $strlen-$offset;
+        if ($length < ($offset - $strlen)) return '';

-      // 2b. convert to start and end byte offsets
-      list($start,$end) = _utf8_byteindex($str,$offset,$offset+$length);
-      return substr($str,$start,$end-$start);
+        $Lx = (int)((-$length)/65535);
+        $Ly = (-$length)%65535;
+
+        // -ve length requires ... capture everything except a group of -length characters 
+        //                         anchored at the tail-end of the string
+        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
+        $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
+      }
    }
-}

+    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
+    return $match[1];
+}

 /**
 * Unicode aware replacement for substr_replace()
@@ -815,69 +838,6 @@ function utf8_correctIdx(&$str,$i,$next=false) {
  return $i;
 }

-/**
- * determine the byte indexes into a utf-8 string for one or more character offsets
- * PRIVATE  (could be made public with proper paramter checking)
- *
- * @author  Chris Smith <chris@jalakai.co.uk>
- *
- * @param   string    $str      utf8 string
- * @param   int       $offset   any number of character offsets into $str
- *
- * @return  array     byte indexes into $str, one index for each offset argument
- */
-function _utf8_byteindex() {
-
-  $args = func_get_args();
-  $str =& array_shift($args);
-  if (!is_string($str)) return false;
-
-  $result = array();
-
-  // use a short piece of str to estimate bytes per character
-  $i = utf8_correctIdx($str, 300, true);           // $i (& $j) -> byte indexes into $str
-  $c = utf8_strlen(substr($str,0,$i));             // $c -> character offset into $str
-
-  sort($args);                                     // deal with arguments from lowest to highest
-  foreach ($args as $offset) {
-    // sanity checks FIXME
-
-    // 0 is an easy check
-    if ($offset == 0) { $result[] = 0; continue; }
-
-    $safety_valve = 50;                            // ensure no endless looping
-
-    do {
-      $j = (int)($offset * $i/$c);                 // apply latest bytes/character estimate to offset
-      $j = utf8_correctIdx($str, $j, true);        // correct to utf8 character boundary
-
-      if ($j > $i) {
-        $c += utf8_strlen(substr($str,$i,$j-$i));  // determine new character offset
-      } else {
-        $c -= utf8_strlen(substr($str,$j,$i-$j));  // ditto
-      }
-
-      $error = abs($c-$offset);
-
-      $i = $j;                                     // ready for next time around
-    } while (($error > 7) && --$safety_valve) ;    // from 7 it is faster to iterate over the string
-
-    if ($error && $error <= 7) {
-      if ($c < $offset) {
-        // move up
-        while ($error--) { $i = utf8_correctIdx($str,++$i,true); }
-      } else {
-        // move down
-        while ($error--) { $i = utf8_correctIdx($str,--$i,false); }
-      }
-      $c = $offset;                                // ready for next arg
-    }
-    $result[] = $i;
-  }
-
-  return $result;  
-}
-
 // only needed if no mb_string available
 if(!UTF8_MBSTRING){