Skip to content
Snippets Groups Projects
Commit 1c07b9e6 authored by Tom N Harris's avatar Tom N Harris
Browse files

Use external program to split pages into words

An external tokenizer inserts extra spaces to mark words in the input text.
The text is sent through STDIN and STDOUT file handles.

A good choice for Chinese and Japanese is MeCab.
http://sourceforge.net/projects/mecab/
With the command line 'mecab -O wakati'
parent 6c528220
No related branches found
No related tags found
No related merge requests found
......@@ -133,6 +133,8 @@ $conf['broken_iua'] = 0; //Platform with broken ignore_user_abort
$conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard)
$conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation
$conf['rememberme'] = 1; //Enable/disable remember me on login
$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing
$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati'; //Command run when external_tokenizer is on; text is piped via STDIN/STDOUT (e.g. MeCab for Chinese/Japanese)
//Set target to use when creating links - leave empty for same window
$conf['target']['wiki'] = '';
......
......@@ -662,6 +662,7 @@ function idx_parseIndexLine(&$page_idx,$line){
* @author Andreas Gohr <andi@splitbrain.org>
*/
function idx_tokenizer($string,&$stopwords,$wc=false){
global $conf;
$words = array();
$wc = ($wc) ? '' : $wc = '\*';
......@@ -670,6 +671,16 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
else
$sw =& $stopwords;
if ($conf['external_tokenizer']) {
if (0 == io_runcmd($conf['tokenizer_cmd'], $string, $output))
$string = $output;
} else {
if(preg_match('/[^0-9A-Za-z ]/u', $string)) {
// handle asian chars as single words (may fail on older PHP version)
$asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
if(!is_null($asia)) $string = $asia; //recover from regexp failure
}
}
$string = strtr($string, "\r\n\t", ' ');
if(preg_match('/[^0-9A-Za-z ]/u', $string))
$string = utf8_stripspecials($string, ' ', '\._\-:'.$wc);
......@@ -677,24 +688,13 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
$wordlist = explode(' ', $string);
foreach ($wordlist as $word) {
if(preg_match('/[^0-9A-Za-z]/u', $word)){
// handle asian chars as single words (may fail on older PHP version)
$asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$word);
if(!is_null($asia)) $word = $asia; //recover from regexp failure
$arr = explode(' ', $word);
foreach ($arr as $w) {
if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
$w = utf8_strtolower($w);
if(is_int(array_search("$w\n",$stopwords))) continue;
$words[] = $w;
}
$word = utf8_strtolower($word);
}else{
$w = $word;
if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) continue;
$w = strtolower($w);
if(is_int(array_search("$w\n",$stopwords))) continue;
$words[] = $w;
$word = strtolower($word);
}
if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
if(is_int(array_search("$word\n",$stopwords))) continue;
$words[] = $word;
}
return $words;
......
......@@ -141,6 +141,8 @@ $lang['renderer_xhtml'] = 'Renderer to use for main (xhtml) wiki output';
$lang['renderer__core'] = '%s (dokuwiki core)';
$lang['renderer__plugin'] = '%s (plugin)';
$lang['rememberme'] = 'Allow permanent login cookies (remember me)';
// Admin-UI labels for the external tokenizer settings (external_tokenizer / tokenizer_cmd)
$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing';
$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer';
$lang['rss_type'] = 'XML feed type';
$lang['rss_linkto'] = 'XML feed links to';
......
......@@ -190,6 +190,8 @@ $meta['broken_iua'] = array('onoff');
$meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3));
$meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml'));
$meta['readdircache'] = array('numeric');
// Config-manager metadata: external_tokenizer is an on/off toggle, tokenizer_cmd a free-form string
$meta['external_tokenizer'] = array('onoff');
$meta['tokenizer_cmd'] = array('string');
$meta['_network'] = array('fieldset');
$meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i');
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment