diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php index 2405494e09b3fd5ebd77cf73bafa82b37b729cf5..f10c70e5851e233d3dc45777120b67e2b4f65623 100644 --- a/conf/dokuwiki.php +++ b/conf/dokuwiki.php @@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abor $conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard) $conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation $conf['rememberme'] = 1; //Enable/disable remember me on login +$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing +$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati'; //Set target to use when creating links - leave empty for same window $conf['target']['wiki'] = ''; diff --git a/inc/indexer.php b/inc/indexer.php index b3e10a54808aff5a2030e9bdf3bbe43b1dfd4934..1c955a99d932659159792c15c939cf5a51f2412c 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -662,6 +662,7 @@ function idx_parseIndexLine(&$page_idx,$line){ * @author Andreas Gohr <andi@splitbrain.org> */ function idx_tokenizer($string,&$stopwords,$wc=false){ + global $conf; $words = array(); $wc = ($wc) ? '' : $wc = '\*'; @@ -670,6 +671,16 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ else $sw =& $stopwords; + if ($conf['external_tokenizer']) { + if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output)) + $string = $output; + } else { + if(preg_match('/[^0-9A-Za-z ]/u', $string)) { + // handle asian chars as single words (may fail on older PHP version) + $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string); + if(!is_null($asia)) $string = $asia; //recover from regexp failure + } + } $string = strtr($string, "\r\n\t", ' '); if(preg_match('/[^0-9A-Za-z ]/u', $string)) $string = utf8_stripspecials($string, ' ', '\._\-:'.$wc); @@ -677,24 +688,13 @@ function idx_tokenizer($string,&$stopwords,$wc=false){ $wordlist = explode(' ', $string); foreach ($wordlist as $word) { if(preg_match('/[^0-9A-Za-z]/u', $word)){ - // handle asian chars as single words (may fail on older PHP version) - $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word); - if(!is_null($asia)) $word = $asia; //recover from regexp failure - - $arr = explode(' ', $word); - foreach ($arr as $w) { - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = utf8_strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; - } + $word = utf8_strtolower($word); }else{ - $w = $word; - if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue; - $w = strtolower($w); - if(is_int(array_search("$w\n",$stopwords))) continue; - $words[] = $w; + $word = strtolower($word); } + if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue; + if(is_int(array_search("$word\n",$stopwords))) continue; + $words[] = $word; } return $words; diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php index a944d6bd72f8941e31e57eb48f191a6a28acc96d..85214bf9881f871fafaf11e586002e01820a112f 100644 --- a/lib/plugins/config/lang/en/lang.php +++ b/lib/plugins/config/lang/en/lang.php @@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output'; $lang['renderer__core'] = '%s (dokuwiki core)'; $lang['renderer__plugin'] = '%s (plugin)'; $lang['rememberme'] = 'Allow permanent login cookies (remember me)'; +$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing'; +$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer'; $lang['rss_type'] = 'XML feed type'; $lang['rss_linkto'] = 'XML feed links to'; diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php index edba652620f406dbde5d3eb6eb7d665197f32034..331da5ab8d975ba32aedb59982f03822b12b6ddd 100644 --- a/lib/plugins/config/settings/config.metadata.php +++ b/lib/plugins/config/settings/config.metadata.php @@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff'); $meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3)); $meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml')); $meta['readdircache'] = array('numeric'); +$meta['external_tokenizer'] = array('onoff'); +$meta['tokenizer_cmd'] = array('string'); $meta['_network'] = array('fieldset'); $meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i');