From 8cd4c12f3e3d5e9665f20afca85123145912c0e9 Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Sat, 19 Mar 2011 19:52:51 +0100
Subject: [PATCH] replace tokenizer_cmd with action hook

as discussed at
http://www.freelists.org/post/dokuwiki/tokenizer-cmd-in-indexer,1
---
 inc/indexer.php                               | 21 +++++++++++--------
 lib/plugins/config/lang/en/lang.php           |  2 --
 .../config/settings/config.metadata.php       |  2 --
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/inc/indexer.php b/inc/indexer.php
index 7cddb7c54..0fbd939be 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -54,7 +54,7 @@ define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
  * Version of the indexer taking into consideration the external tokenizer.
  * The indexer is only compatible with data written by the same version.
  *
- * Triggers INDEXER_VERSION_GET
+ * @triggers INDEXER_VERSION_GET
  * Plugins that modify what gets indexed should hook this event and
  * add their version info to the event data like so:
  *     $data[$plugin_name] = $plugin_version;
@@ -66,10 +66,7 @@ function idx_get_version(){
     static $indexer_version = null;
     if ($indexer_version == null) {
         global $conf;
-        if($conf['external_tokenizer'])
-            $version = INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']);
-        else
-            $version = INDEXER_VERSION;
+        $version = INDEXER_VERSION;
 
         // DokuWiki version is included for the convenience of plugins
         $data = array('dokuwiki'=>$version);
@@ -405,6 +402,10 @@ class Doku_Indexer {
      *
      * TODO: does this also need &$stopwords ?
      *
+     * @triggers INDEXER_TEXT_PREPARE
+     * This event allows plugins to modify the text before it gets tokenized.
+     * Plugins intercepting this event should also intercept INDEXER_VERSION_GET
+     *
      * @param string    $text   plain text
      * @param boolean   $wc     are wildcards allowed?
      * @return array            list of words in the text
@@ -417,16 +418,18 @@ class Doku_Indexer {
         $wc = ($wc) ? '' : '\*';
         $stopwords =& idx_get_stopwords();
 
-        if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') {
-            if (0 == io_exec($conf['tokenizer_cmd'], $text, $output))
-                $text = $output;
-        } else {
+        // prepare the text to be tokenized
+        $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text);
+        if ($evt->advise_before(true)) {
             if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                 // handle asian chars as single words (may fail on older PHP version)
                 $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                 if (!is_null($asia)) $text = $asia; // recover from regexp falure
             }
         }
+        $evt->advise_after();
+        unset($evt);
+
         $text = strtr($text,
                        array(
                            "\r" => ' ',
diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php
index d7a544850..18bfb56fa 100644
--- a/lib/plugins/config/lang/en/lang.php
+++ b/lib/plugins/config/lang/en/lang.php
@@ -142,8 +142,6 @@ $lang['renderer_xhtml']   = 'Renderer to use for main (xhtml) wiki output';
 $lang['renderer__core']   = '%s (dokuwiki core)';
 $lang['renderer__plugin'] = '%s (plugin)';
 $lang['rememberme'] = 'Allow permanent login cookies (remember me)';
-$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing';
-$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer';
 
 $lang['rss_type']    = 'XML feed type';
 $lang['rss_linkto']  = 'XML feed links to';
diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php
index ca2cd0c12..af7e63a61 100644
--- a/lib/plugins/config/settings/config.metadata.php
+++ b/lib/plugins/config/settings/config.metadata.php
@@ -194,8 +194,6 @@ $meta['broken_iua']  = array('onoff');
 $meta['xsendfile']   = array('multichoice','_choices' => array(0,1,2,3));
 $meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml'));
 $meta['readdircache'] = array('numeric');
-$meta['external_tokenizer'] = array('onoff');
-$meta['tokenizer_cmd'] = array('string');
 
 $meta['_network']    = array('fieldset');
 $meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i');
-- 
GitLab