From 5953e88907368380d326c187b3d1071f575c7daf Mon Sep 17 00:00:00 2001
From: chris <chris@jalakai.co.uk>
Date: Sat, 26 Aug 2006 11:53:11 +0200
Subject: [PATCH] ft_snippet() update, fix utf8 problems

darcs-hash:20060826095311-9b6ab-9a6f272cc7c7532eb2bad8f7b4404c5a16b71109.gz
---
Review notes (this section is ignored by git am):
 - utf8_correctIdx() now clamps out-of-range indices instead of reading
   $str[$i] outside the string. ft_snippet() can pass the index of the end
   of $text when a match context runs up to the end (assuming $len is
   strlen($text), which is not visible in these hunks).
 - The "index" blob hashes below were not regenerated for the amended hunks.
 - Leading whitespace of the context lines was reconstructed; use
   `git apply --ignore-whitespace` if the hunks for inc/fulltext.php or
   inc/utf8.php do not match.

 _test/cases/inc/utf8_correctidx.test.php | 42 ++++++++++++++++++++++++++++++++
 inc/fulltext.php                         | 13 +++++-----
 inc/utf8.php                             | 38 +++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 6 deletions(-)
 create mode 100644 _test/cases/inc/utf8_correctidx.test.php

diff --git a/_test/cases/inc/utf8_correctidx.test.php b/_test/cases/inc/utf8_correctidx.test.php
new file mode 100644
index 000000000..1e7abf04a
--- /dev/null
+++ b/_test/cases/inc/utf8_correctidx.test.php
@@ -0,0 +1,42 @@
+<?php
+// use no mbstring help here
+if(!defined('UTF8_NOMBSTRING')) define('UTF8_NOMBSTRING',1);
+require_once DOKU_INC.'inc/utf8.php';
+
+class utf8_correctidx_test extends UnitTestCase {
+
+
+    function test1(){
+        // we test multiple cases here - format: in, byte index, next flag, expected index
+        $tests = array();
+
+        $tests[] = array('живπά우리をã‚öä',1,false,0);
+        $tests[] = array('живπά우리をã‚öä',2,false,2);
+        $tests[] = array('живπά우리をã‚öä',1,true,2);
+        $tests[] = array('живπά우리をã‚öä',0,false,0);
+        $tests[] = array('живπά우리をã‚öä',2,true,2);
+
+        foreach($tests as $test){
+            $this->assertEqual(utf8_correctIdx($test[0],$test[1],$test[2]),$test[3]);
+        }
+    }
+
+    function testOutOfRange(){
+        // indices outside the string are clamped - format: in, byte index, next flag, expected index
+        $tests = array();
+
+        $tests[] = array('aä',3,false,3);
+        $tests[] = array('aä',3,true,3);
+        $tests[] = array('aä',9,false,3);
+        $tests[] = array('aä',9,true,3);
+        $tests[] = array('aä',-1,false,0);
+        $tests[] = array('aä',-1,true,0);
+        $tests[] = array('',0,false,0);
+
+        foreach($tests as $test){
+            $this->assertEqual(utf8_correctIdx($test[0],$test[1],$test[2]),$test[3]);
+        }
+    }
+
+}
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/fulltext.php b/inc/fulltext.php
index de1a4217b..6ab22a5c2 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -267,9 +267,10 @@ switch ($algorithm) {
 
       list($str,$idx) = $match[0];
 
-      // establish context, 100 characters surrounding the match string
+      // establish context, 100 bytes surrounding the match string
       // first look to see if we can go 100 either side,
-      // then drop to 50 adding any excess if the other side can't go to 50.
+      // then drop to 50 adding any excess if the other side can't go to 50,
+      // NOTE: these are byte adjustments and will have to be corrected for utf-8
       $pre = min($idx-$offset,100);
       $post = min($len-$idx-strlen($str),100);
 
@@ -282,9 +283,9 @@ switch ($algorithm) {
       }
 
       // establish context start and end points, try to append to previous context if possible
-      $start = $idx - $pre;
-      $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
-      $end = $idx + strlen($str) + $post;        // now set it to the end of this context
+      $start = utf8_correctIdx($text,$idx - $pre);
+      $append = ($start < $end) ? $end : false;                   // still the end of the previous context snippet
+      $end = utf8_correctIdx($text, $idx + strlen($str) + $post); // now set it to the end of this context
 
       if ($append) {
         $snippets[count($snippets)-1] .= substr($text,$append,$end-$append);
@@ -305,7 +306,7 @@ switch ($algorithm) {
     break;
 }
 
-    return utf8_bad_replace($snippet);
+    return $snippet;
 }
 
 /**
diff --git a/inc/utf8.php b/inc/utf8.php
index 16722ab2e..0323bed4b 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -762,6 +762,44 @@ function utf8_bad_replace($str, $replace = '') {
   return $result;
 }
 
+/**
+ * adjust a byte index into a utf8 string to a utf8 character boundary
+ *
+ * A byte matching 10xxxxxx ((byte & 0xC0) == 0x80) is a continuation byte,
+ * i.e. it lies inside a multibyte character; the index is moved off such
+ * bytes in the requested direction.
+ *
+ * Out of range indices are clamped: anything <= 0 yields 0 and anything
+ * >= strlen($str) yields strlen($str), so $str is never read outside its
+ * bounds.
+ *
+ * @param $str   string   utf8 character string (by reference, not modified)
+ * @param $i     int      byte index into $str
+ * @param $next  bool     direction to search for boundary,
+ *                           false = backwards (start of current character)
+ *                           true = forwards (start of next character)
+ *
+ * @return int            byte index into $str now pointing to a utf8 character boundary
+ *
+ * @author       chris smith <chris@jalakai.co.uk>
+ */
+function utf8_correctIdx(&$str,$i,$next=false) {
+
+  // both ends of the string are character boundaries, no byte needs inspecting
+  if ($i <= 0) return 0;
+
+  $limit = strlen($str);
+  if ($i >= $limit) return $limit;
+
+  if ($next) {
+    while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
+  } else {
+    while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
+  }
+
+  return $i;
+}
+
 // only needed if no mb_string available
 if(!UTF8_MBSTRING){
 
-- 
GitLab