From 5953e88907368380d326c187b3d1071f575c7daf Mon Sep 17 00:00:00 2001
From: chris <chris@jalakai.co.uk>
Date: Sat, 26 Aug 2006 11:53:11 +0200
Subject: [PATCH] ft_snippet() update, fix utf8 problems

darcs-hash:20060826095311-9b6ab-9a6f272cc7c7532eb2bad8f7b4404c5a16b71109.gz
---
 _test/cases/inc/utf8_correctidx.test.php | 25 ++++++++++++++++++++++++
 inc/fulltext.php                         | 13 ++++++------
 inc/utf8.php                             | 25 ++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 6 deletions(-)
 create mode 100644 _test/cases/inc/utf8_correctidx.test.php

diff --git a/_test/cases/inc/utf8_correctidx.test.php b/_test/cases/inc/utf8_correctidx.test.php
new file mode 100644
index 000000000..1e7abf04a
--- /dev/null
+++ b/_test/cases/inc/utf8_correctidx.test.php
@@ -0,0 +1,25 @@
+<?php
+// use no mbstring help here
+if(!defined('UTF8_NOMBSTRING')) define('UTF8_NOMBSTRING',1);
+require_once DOKU_INC.'inc/utf8.php';
+
+class utf8_correctidx_test extends UnitTestCase {
+
+    /**
+     * utf8_correctIdx() must move a byte offset that falls inside a
+     * multi-byte character onto a character boundary.
+     */
+    function test1(){
+        // the first character is two bytes wide, so byte 1 is a continuation byte
+        $str = 'живπά우리をあöä';
+
+        // format: byte offset, search forwards ($next), expected byte offset
+        $tests = array(array(1,false,0), array(2,false,2), array(1,true,2),
+                       array(0,false,0), array(2,true,2));
+
+        foreach($tests as $test){
+            $this->assertEqual(utf8_correctIdx($str,$test[0],$test[1]),$test[2]);
+        }
+    }
+}
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/fulltext.php b/inc/fulltext.php
index de1a4217b..6ab22a5c2 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -267,9 +267,10 @@ switch ($algorithm) {
 
       list($str,$idx) = $match[0];
 
-      // establish context, 100 characters surrounding the match string
+      // establish context, 100 bytes surrounding the match string
       // first look to see if we can go 100 either side,
-      // then drop to 50 adding any excess if the other side can't go to 50.
+      // then drop to 50 adding any excess if the other side can't go to 50,
+      // NOTE: these are byte adjustments and will have to be corrected for utf-8
       $pre = min($idx-$offset,100);
       $post = min($len-$idx-strlen($str),100);
 
@@ -282,9 +283,9 @@ switch ($algorithm) {
       }
 
       // establish context start and end points, try to append to previous context if possible
-      $start = $idx - $pre;
-      $append = ($start < $end) ? $end : false;   // still the end of the previous context snippet
-      $end = $idx + strlen($str) + $post;         // now set it to the end of this context
+      $start = utf8_correctIdx($text,$idx - $pre);
+      $append = ($start < $end) ? $end : false;                     // still the end of the previous context snippet
+      $end = utf8_correctIdx($text, $idx + strlen($str) + $post);   // now set it to the end of this context
 
       if ($append) {
         $snippets[count($snippets)-1] .= substr($text,$append,$end-$append);
@@ -305,7 +306,7 @@ switch ($algorithm) {
   break;
 }
 
-    return utf8_bad_replace($snippet);
+    return $snippet;
 }
 
 /**
diff --git a/inc/utf8.php b/inc/utf8.php
index 16722ab2e..0323bed4b 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -762,6 +762,31 @@ function utf8_bad_replace($str, $replace = '') {
     return $result;
 }
 
+/**
+ * Adjust a byte index into a UTF-8 string to a UTF-8 character boundary.
+ *
+ * @param string $str   UTF-8 string; taken by reference but only read
+ * @param int    $i     byte index into $str, clamped to 0..strlen($str)
+ * @param bool   $next  direction to search for the boundary:
+ *                        false = backwards (start of the current character)
+ *                        true  = forwards (start of the next character)
+ *
+ * @return int          byte index into $str that lies on a character boundary
+ *
+ * @author       chris smith <chris@jalakai.co.uk>
+ */
+function utf8_correctIdx(&$str,$i,$next=false) {
+    // clamp first: reading $str[$i] outside the string is an undefined offset
+    $limit = strlen($str);
+    if ($i <= 0) return 0;
+    if ($i >= $limit) return $limit;
+
+    // UTF-8 continuation bytes have the bit pattern 10xxxxxx; step off them
+    $step = $next ? 1 : -1;
+    while ($i > 0 && $i < $limit && (ord($str[$i]) & 0xC0) == 0x80) $i += $step;
+    return $i;
+}
+
 // only needed if no mb_string available
 if(!UTF8_MBSTRING){
 
-- 
GitLab