From 91bb5faaff4ff41771606c58f608afd76263b8c7 Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Sun, 9 Oct 2005 14:48:33 +0200
Subject: [PATCH] ignore regexp failures when handling asian chars

The new handling of Asian chars as single words needs a recent PCRE library
(PHP 4.3.10 is known to work). If this support isn't available, the regexp
compilation will fail. This patch adds a workaround that ignores the failure
and keeps the unmodified input, which means the search will not work as
expected with Asian words on older PHP versions.

darcs-hash:20051009124833-7ad00-1319829be5cb73246e13eb65e4c950d43c6ce5bf.gz
---
 inc/fulltext.php |  2 +-
 inc/indexer.php  | 22 ++++++++++++----------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/inc/fulltext.php b/inc/fulltext.php
index 89fa5b259..4d4b8138c 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -265,7 +265,7 @@ function ft_queryParser($query){
             if(count($token)) $q['not'] = array_merge($q['not'],$token);
         }else{
             // asian "words" need to be searched as phrases
-            if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
+            if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
                 $q['phrases'] = array_merge($q['phrases'],$matches[1]);
 
             }
diff --git a/inc/indexer.php b/inc/indexer.php
index a8511b1ee..22bd8566b 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -17,12 +17,12 @@
 // Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
 // I'm no language expert. If you think some ranges are wrongly chosen or
 // a range is missing, please contact me
-define(IDX_ASIAN,'['.
-                 '\x{0E00}-\x{0E7F}'.  // Thai
-                 '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
-                 '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
-                 '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
-                 ']');
+define('IDX_ASIAN','['.
+                   '\x{0E00}-\x{0E7F}'.  // Thai
+                   '\x{2E80}-\x{D7AF}'.  // CJK -> Hangul
+                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
+                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
+                   ']');
 
 
 /**
@@ -52,8 +52,9 @@ function idx_getPageWords($page){
     foreach ($tokens as $word => $count) {
         // simple filter to restrict use of utf8_stripspecials 
         if (preg_match('/[^0-9A-Za-z]/u', $word)) {
-            // handle asian chars as single words
-            $word = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
+            // handle asian chars as single words (may fail on older PHP version)
+            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
+            if(!is_null($asia)) $word = $asia; //recover from regexp failure
             $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
             $arr = array_count_values($arr);
             
@@ -326,8 +327,9 @@ function idx_tokenizer($string,&$stopwords){
     $words = array();
 
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
-        #handle asian chars as single words
-        $string = preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+        // handle asian chars as single words (may fail on older PHP version)
+        $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$string);
+        if(!is_null($asia)) $string = $asia; //recover from regexp failure
 
         $arr = explode(' ', utf8_stripspecials($string,' ','._\-:'));
         foreach ($arr as $w) {
-- 
GitLab