Skip to content
Snippets Groups Projects
Commit 91bb5faa authored by Andreas Gohr
Browse files

ignore regexp failures when handling asian chars

The new handling of Asian characters as single words needs a recent PCRE library
(PHP 4.3.10 is known to work). If this support isn't available, the regexp
compilation will fail. This patch adds a workaround that ignores the failure, so
search will not work as expected with Asian words on older PHP versions.

darcs-hash:20051009124833-7ad00-1319829be5cb73246e13eb65e4c950d43c6ce5bf.gz
parent 037f7611
No related branches found
No related tags found
No related merge requests found
......@@ -265,7 +265,7 @@ function ft_queryParser($query){
if(count($token)) $q['not'] = array_merge($q['not'],$token);
}else{
// asian "words" need to be searched as phrases
if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
$q['phrases'] = array_merge($q['phrases'],$matches[1]);
}
......
......@@ -17,12 +17,12 @@
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define(IDX_ASIAN,'['.
'\x{0E00}-\x{0E7F}'. // Thai
'\x{2E80}-\x{D7AF}'. // CJK -> Hangul
'\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs
'\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms
']');
define('IDX_ASIAN','['.
'\x{0E00}-\x{0E7F}'. // Thai
'\x{2E80}-\x{D7AF}'. // CJK -> Hangul
'\x{F900}-\x{FAFF}'. // CJK Compatibility Ideographs
'\x{FE30}-\x{FE4F}'. // CJK Compatibility Forms
']');
/**
......@@ -52,8 +52,9 @@ function idx_getPageWords($page){
foreach ($tokens as $word => $count) {
// simple filter to restrict use of utf8_stripspecials
if (preg_match('/[^0-9A-Za-z]/u', $word)) {
// handle asian chars as single words
$word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
// handle asian chars as single words (may fail on older PHP version)
$asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
if(!is_null($asia)) $word = $asia; //recover from regexp failure
$arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
$arr = array_count_values($arr);
......@@ -326,8 +327,9 @@ function idx_tokenizer($string,&$stopwords){
$words = array();
if(preg_match('/[^0-9A-Za-z]/u', $string)){
#handle asian chars as single words
$string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
// handle asian chars as single words (may fail on older PHP version)
$asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
if(!is_null($asia)) $string = $asia; //recover from regexp failure
$arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
foreach ($arr as $w) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment