diff --git a/_test/cases/inc/utf8_correctidx.test.php b/_test/cases/inc/utf8_correctidx.test.php
new file mode 100644
index 0000000000000000000000000000000000000000..1e7abf04a7cff87dbab4304fb30f5a91abe750eb
--- /dev/null
+++ b/_test/cases/inc/utf8_correctidx.test.php
@@ -0,0 +1,25 @@
+<?php
+// use no mbstring help here
+if(!defined('UTF8_NOMBSTRING')) define('UTF8_NOMBSTRING',1);
+require_once DOKU_INC.'inc/utf8.php';
+
+class utf8_correctidx_test extends UnitTestCase {
+
+
+    function test1(){
+        // we test multiple cases here - format: in, byte index, next, out
+        $tests = array();
+
+        $tests[] = array('живπά우리をあöä',1,false,0);
+        $tests[] = array('живπά우리をあöä',2,false,2);
+        $tests[] = array('живπά우리をあöä',1,true,2);
+        $tests[] = array('живπά우리をあöä',0,false,0);
+        $tests[] = array('живπά우리をあöä',2,true,2);
+
+        foreach($tests as $test){
+            $this->assertEqual(utf8_correctIdx($test[0],$test[1],$test[2]),$test[3]);
+        }
+    }
+
+}
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/fulltext.php b/inc/fulltext.php
index de1a4217bf0aafea345863e58ec6e0d16b00427c..6ab22a5c20c0453577191ca129a6e9b69eeed1de 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -267,9 +267,10 @@ switch ($algorithm) {
 
       list($str,$idx) = $match[0];
 
-      // establish context, 100 characters surrounding the match string
+      // establish context, 100 bytes surrounding the match string
       // first look to see if we can go 100 either side,
-      // then drop to 50 adding any excess if the other side can't go to 50.
+      // then drop to 50 adding any excess if the other side can't go to 50,
+      // NOTE: these are byte adjustments and will have to be corrected for utf-8
       $pre = min($idx-$offset,100);
       $post = min($len-$idx-strlen($str),100);
 
@@ -282,9 +283,9 @@ switch ($algorithm) {
       }
 
       // establish context start and end points, try to append to previous context if possible
-      $start = $idx - $pre;
-      $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
-      $end = $idx + strlen($str) + $post;  // now set it to the end of this context
+      $start = utf8_correctIdx($text,$idx - $pre);
+      $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
+      $end = utf8_correctIdx($text, $idx + strlen($str) + $post);  // now set it to the end of this context
 
       if ($append) {
         $snippets[count($snippets)-1] .= substr($text,$append,$end-$append);
@@ -305,7 +306,7 @@ switch ($algorithm) {
       break;
   }
 
-  return utf8_bad_replace($snippet);
+  return $snippet;
 }
 
 /**
diff --git a/inc/utf8.php b/inc/utf8.php
index 16722ab2e4a394694ab2051facf4503d7eb0e7bb..0323bed4b47522a839c7b58fe7015cad3c9552c4 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -762,6 +762,41 @@ function utf8_bad_replace($str, $replace = '') {
   return $result;
 }
 
+/**
+ * Adjust a byte index into a UTF-8 string to a UTF-8 character boundary
+ *
+ * Continuation bytes (bit pattern 10xxxxxx) are skipped in the requested
+ * direction until a byte starting a character is found. Indexes outside
+ * the string are clamped to 0 or strlen($str) without reading any offset.
+ *
+ * @param $str   string   utf8 character string
+ * @param $i     int      byte index into $str
+ * @param $next  bool     direction to search for boundary,
+ *                           false = up (current character)
+ *                           true = down (next character)
+ *
+ * @return int            byte index into $str now pointing to a utf8 character boundary
+ *
+ * @author       chris smith <chris@jalakai.co.uk>
+ */
+function utf8_correctIdx(&$str,$i,$next=false) {
+
+  // nothing before the start of the string to point at
+  if ($i <= 0) return 0;
+
+  // the end of the string is a boundary; $str[$limit] must not be read
+  $limit = strlen($str);
+  if ($i >= $limit) return $limit;
+
+  if ($next) {
+    while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
+  } else {
+    while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
+  }
+
+  return $i;
+}
+
 // only needed if no mb_string available
 if(!UTF8_MBSTRING){
 