From 579b0f7e8d80287b11fd441dfa68d15e9d4bb74c Mon Sep 17 00:00:00 2001
From: TNHarris <telliamed@fastmail.us>
Date: Sun, 12 Nov 2006 20:49:00 +0100
Subject: [PATCH] Word-Length Indexer

A modification to the indexer that sorts words based on length. This should make
searching a little bit more efficient. After the patch is applied, your old index
will be automatically converted to the new format (when you visit a page). The
new index format is:

1. Index files are stored in savedir/index
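2. Word lists are stored in one file per word length, named wN.idx where N is
   the word length (for example w5.idx). This used to be the single file word.idx.
3. Word indexes are stored in one file per word length, named iN.idx (for
   example i5.idx). This used to be the single file index.idx.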
4. The page list, page.idx, is simply copied to the new location.

Any plugins you have, such as the blog plugin, that read the index files need to
be updated.

darcs-hash:20061112194900-2b9f0-a975498ccf0a1d39c6df73b79bcd028d5e81c389.gz
---
 data/index/_dummy                     |   0
 inc/fulltext.php                      |   2 +-
 inc/indexer.php                       | 319 ++++++++++++++++----------
 inc/init.php                          |   5 +-
 lib/exe/indexer.php                   |  14 +-
 lib/plugins/importoldindex/action.php |  58 +++++
 6 files changed, 273 insertions(+), 125 deletions(-)
 create mode 100644 data/index/_dummy
 create mode 100644 lib/plugins/importoldindex/action.php

diff --git a/data/index/_dummy b/data/index/_dummy
new file mode 100644
index 000000000..e69de29bb
diff --git a/inc/fulltext.php b/inc/fulltext.php
index 448f72248..1534ec1a8 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -149,7 +149,7 @@ function ft_backlinks($id){
 function ft_pageLookup($id,$pageonly=true){
     global $conf;
     $id    = preg_quote($id,'/');
-    $pages = file($conf['cachedir'].'/page.idx');
+    $pages = file($conf['indexdir'].'/page.idx');
     $pages = array_values(preg_grep('/'.$id.'/',$pages));
 
     $cnt = count($pages);
diff --git a/inc/indexer.php b/inc/indexer.php
index e6550c2e4..c90f2b179 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -25,17 +25,60 @@ define('IDX_ASIAN','['.
                    ']');
 
 
+/**
+ * Save an index: join the strings into a .tmp file, then io_rename() it to
+ * $pre$wlen.idx in the index directory. False if the .tmp can't be opened, else true.
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_saveIndex($pre, $wlen, $idx){
+    global $conf;
+    $target  = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
+    $scratch = $conf['indexdir'].'/'.$pre.$wlen.'.tmp';
+    if(!($out = @fopen($scratch,'w'))) return false;
+    fwrite($out,implode('',$idx));
+    fclose($out);
+    if($conf['fperm']) chmod($scratch, $conf['fperm']);
+    io_rename($scratch, $target);
+    return true;
+}
+
+/**
+ * Load $pre$wlen.idx from the index directory as an array of lines
+ * (line endings are kept). A missing file yields an empty array.
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_getIndex($pre, $wlen){
+    global $conf;
+    $path = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
+    $lines = @file_exists($path) ? file($path) : array();
+    return $lines;
+}
+
+/**
+ * Make sure $pre$wlen.idx exists in the index directory: a missing file is
+ * created empty and, when $conf['fperm'] is set, chmod'ed. Existing files are kept.
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_touchIndex($pre, $wlen){
+    global $conf;
+    $path = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
+    if(@file_exists($path)) return; // already there, nothing to do
+    touch($path);
+    $mode = $conf['fperm'];
+    if($mode) chmod($path, $mode);
+}
+
 /**
  * Split a page into words
  *
- * Returns an array of of word counts, false if an error occured
+ * Returns an array of word counts, false if an error occured.
+ * Array is keyed on the word length, then the word index.
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  * @author Christopher Smith <chris@jalakai.co.uk>
  */
 function idx_getPageWords($page){
     global $conf;
-    $word_idx = file($conf['cachedir'].'/word.idx');
     $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
     if(@file_exists($swfile)){
         $stopwords = file($swfile);
@@ -65,47 +108,40 @@ function idx_getPageWords($page){
 
     $words = array();
     foreach ($tokens as $word => $count) {
-        // simple filter to restrict use of utf8_stripspecials
-        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
-            // handle asian chars as single words (may fail on older PHP version)
-            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
-            if(!is_null($asia)) $word = $asia; //recover from regexp failure
-            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
-            $arr = array_count_values($arr);
-
-            foreach ($arr as $w => $c) {
-                if (!is_numeric($w) && strlen($w) < 3) continue;
-                $w = utf8_strtolower($w);
-                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
+        // count the tokens; the ternary is parenthesized because '+' binds tighter than '?:'
+        $arr = array_count_values(idx_tokenizer($word,$stopwords));
+        foreach ($arr as $w => $c) {
+            $l = strlen($w);
+            if(isset($words[$l])){
+                $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
+            }else{
+                $words[$l] = array($w => $c * $count);
             }
-        } else {
-            if (!is_numeric($word) && strlen($word) < 3) continue;
-            $word = strtolower($word);
-            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
         }
     }
 
-    // arrive here with $words = array(word => frequency)
+    // arrive here with $words = array(wordlen => array(word => frequency))
 
     $index = array(); //resulting index
-    foreach ($words as $word => $freq) {
-    if (is_int(array_search("$word\n",$stopwords))) continue;
-        $wid = array_search("$word\n",$word_idx);
-        if(!is_int($wid)){
-            $word_idx[] = "$word\n";
-            $wid = count($word_idx)-1;
+    foreach (array_keys($words) as $wlen){
+        $word_idx = idx_getIndex('w',$wlen);
+        foreach ($words[$wlen] as $word => $freq) {
+            $wid = array_search("$word\n",$word_idx);
+            if(!is_int($wid)){
+                $word_idx[] = "$word\n";
+                $wid = count($word_idx)-1;
+            }
+            if(!isset($index[$wlen]))
+                $index[$wlen] = array();
+            $index[$wlen][$wid] = $freq;
         }
-        $index[$wid] = $freq;
-    }
 
-    // save back word index
-    $fh = fopen($conf['cachedir'].'/word.idx','w');
-    if(!$fh){
-        trigger_error("Failed to write word.idx", E_USER_ERROR);
-        return false;
+        // save back word index
+        if(!idx_saveIndex('w',$wlen,$word_idx)){
+            trigger_error("Failed to write word index", E_USER_ERROR);
+            return false;
+        }
     }
-    fwrite($fh,join('',$word_idx));
-    fclose($fh);
 
     return $index;
 }
@@ -123,7 +159,7 @@ function idx_addPage($page){
     global $conf;
 
     // load known documents
-    $page_idx = file($conf['cachedir'].'/page.idx');
+    $page_idx = idx_getIndex('page','');
 
     // get page id (this is the linenumber in page.idx)
     $pid = array_search("$page\n",$page_idx);
@@ -131,10 +167,8 @@ function idx_addPage($page){
         $page_idx[] = "$page\n";
         $pid = count($page_idx)-1;
         // page was new - write back
-        $fh = fopen($conf['cachedir'].'/page.idx','w');
-        if(!$fh) return false;
-        fwrite($fh,join('',$page_idx));
-        fclose($fh);
+        if (!idx_saveIndex('page','',$page_idx))
+            return false;
     }
 
     // get word usage in page
@@ -142,46 +176,51 @@ function idx_addPage($page){
     if($words === false) return false;
     if(!count($words)) return true;
 
-    // Open index and temp file
-    $idx = fopen($conf['cachedir'].'/index.idx','r');
-    $tmp = fopen($conf['cachedir'].'/index.tmp','w');
-    if(!$idx || !$tmp){
-       trigger_error("Failed to open index files", E_USER_ERROR);
-       return false;
-    }
+    foreach(array_keys($words) as $wlen){
+        // Open index and temp file
+        $fn = $conf['indexdir']."/i$wlen";
+        idx_touchIndex('i',$wlen);
+        $idx = fopen($fn.'.idx','r');
+        $tmp = fopen($fn.'.tmp','w');
+        if(!$idx || !$tmp){
+            trigger_error("Failed to open index files", E_USER_ERROR);
+            return false;
+        }
 
-    // copy from index to temp file, modifying were needed
-    $lno = 0;
-    $line = '';
-    while (!feof($idx)) {
-        // read full line
-        $line .= fgets($idx, 4096);
-        if(substr($line,-1) != "\n") continue;
+        // copy from index to temp file, modifying where needed
+        $lno = 0;
+        $line = '';
+        while (!feof($idx)) {
+            // read full line
+            $line .= fgets($idx, 4096);
+            if(substr($line,-1) != "\n") continue;
 
-        // write a new Line to temp file
-        idx_writeIndexLine($tmp,$line,$pid,$words[$lno]);
+            // write a new Line to temp file
+            idx_writeIndexLine($tmp,$line,$pid,$words[$wlen][$lno]);
 
-        $line = ''; // reset line buffer
-        $lno++;     // increase linecounter
-    }
-    fclose($idx);
-
-    // add missing lines (usually index and word should contain
-    // the same number of lines, however if the page contained
-    // new words the word file has some more lines which need to
-    // be added here
-    $word_idx = file($conf['cachedir'].'/word.idx');
-    $wcnt = count($word_idx);
-    for($lno; $lno<$wcnt; $lno++){
-        idx_writeIndexLine($tmp,'',$pid,$words[$lno]);
+            $line = ''; // reset line buffer
+            $lno++;     // increase linecounter
+        }
+        fclose($idx);
+
+        // add missing lines (usually index and word should contain
+        // the same number of lines, however if the page contained
+        // new words the word file has some more lines which need to
+        // be added here
+        $word_idx = idx_getIndex('w',$wlen);
+        $wcnt = count($word_idx);
+        for($lno; $lno<$wcnt; $lno++){
+            idx_writeIndexLine($tmp,'',$pid,$words[$wlen][$lno]);
+        }
+
+        // close the temp file and move it over to be the new one
+        fclose($tmp);
+        if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
+        // try rename first (fast) fallback to copy (slow)
+        io_rename($fn.'.tmp', $fn.'.idx');
     }
 
-    // close the temp file and move it over to be the new one
-    fclose($tmp);
-    // try rename first (fast) fallback to copy (slow)
-    io_rename($conf['cachedir'].'/index.tmp',
-              $conf['cachedir'].'/index.idx');
-    return false;
+    return true;
 }
 
 /**
@@ -217,6 +256,34 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
     fwrite($fh,"\n");
 }
 
+/**
+ * List the indexed word lengths that are >= $minlen.
+ *
+ * $minlen itself comes first when an i$minlen.idx file exists; it is followed
+ * by every larger numeric length found among the i*.idx files, in the order
+ * readdir() reports them. An index directory that can't be opened gives array().
+ *
+ * @author Tom N Harris <tnharris@whoopdedo.org>
+ */
+function idx_indexLengths($minlen){
+    global $conf;
+    $base = $conf['indexdir'];
+    $handle = @opendir($base);
+    if($handle===false) return array();
+    // the exact length, if indexed, is always listed first
+    $lengths = @file_exists("$base/i$minlen.idx") ? array($minlen) : array();
+    while (($entry = readdir($handle)) !== false) {
+        // only names of the form i...idx with a numeric middle part count
+        if (substr($entry,0,1) != 'i') continue;
+        if (substr($entry,-4) != '.idx') continue;
+        $len = substr($entry,1,-4);
+        if (!is_numeric($len)) continue;
+        if ($len > $minlen) $lengths[] = $len;
+    }
+    closedir($handle);
+    return $lengths;
+}
+
 /**
  * Lookup words in index
  *
@@ -234,8 +301,7 @@ function idx_lookup($words){
     $result = array();
 
     // load known words and documents
-    $page_idx = file($conf['cachedir'].'/page.idx');
-    $word_idx = file($conf['cachedir'].'/word.idx');
+    $page_idx = idx_getIndex('page','');
 
     // get word IDs
     $wids = array();
@@ -243,80 +309,93 @@ function idx_lookup($words){
         $result[$word] = array();
         $wild = 0;
         $xword = $word;
+        $wlen = strlen($word);
 
         // check for wildcards
         if(substr($xword,0,1) == '*'){
             $xword = substr($xword,1);
             $wild  = 1;
             $ptn = '/'.preg_quote($xword,'/').'$/';
+            $wlen -= 1;
 #            $l = -1*strlen($xword)-1;
         }
         if(substr($xword,-1,1) == '*'){
             $xword = substr($xword,0,-1);
             $wild += 2;
+            $wlen -= 1;
         }
+        if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
 
         // look for the ID(s) for the given word
         if($wild){  // handle wildcard search
-            $cnt = count($word_idx);
-            for($wid=0; $wid<$cnt; $wid++){
-                $iword = $word_idx[$wid];
-                if( (($wild==3) && is_int(strpos($iword,$xword))) ||
-#                    (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
-                    (($wild==1) && preg_match($ptn,$iword)) ||
-#                    (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
-                    (($wild==2) && (0 === strpos($iword,$xword)))
-
-                  ){
-                    $wids[] = $wid;
-                    $result[$word][] = $wid;
+            foreach (idx_indexLengths($wlen) as $ixlen){
+                $word_idx = idx_getIndex('w',$ixlen);
+                $cnt = count($word_idx);
+                for($wid=0; $wid<$cnt; $wid++){
+                    $iword = $word_idx[$wid];
+                    if( (($wild==3) && is_int(strpos($iword,$xword))) ||
+#                        (($wild==1) && ("$xword\n" == substr($iword,$l))) ||
+                        (($wild==1) && preg_match($ptn,$iword)) ||
+#                        (($wild==2) && ($xword == substr($iword,0,strlen($xword))))
+                        (($wild==2) && (0 === strpos($iword,$xword)))
+
+                      ){
+                        if(!isset($wids[$ixlen])) $wids[$ixlen] = array();
+                        $wids[$ixlen][] = $wid;
+                        $result[$word][] = "$ixlen*$wid";
+                    }
                 }
             }
         }else{     // handle exact search
+            $word_idx = idx_getIndex('w',$wlen);
             $wid = array_search("$word\n",$word_idx);
             if(is_int($wid)){
-                $wids[] = $wid;
-                $result[$word][] = $wid;
+                $wids[$wlen][] = $wid; // append: ids of other terms with this length must be kept
+                $result[$word][] = "$wlen*$wid";
             }else{
                 $result[$word] = array();
             }
         }
     }
-    sort($wids);
-    $wids = array_unique($wids);
-
-    // Open index
-    $idx = fopen($conf['cachedir'].'/index.idx','r');
-    if(!$idx){
-       msg("Failed to open index file",-1);
-       return false;
-    }
 
-    // Walk the index til the lines are found
     $docs = array();                          // hold docs found
-    $lno  = 0;
-    $line = '';
-    $srch = array_shift($wids);               // which word do we look for?
-    while (!feof($idx)) {
-        // read full line
-        $line .= fgets($idx, 4096);
-        if(substr($line,-1) != "\n") continue;
-        if($lno > $srch)             break;   // shouldn't happen
-
-
-        // do we want this line?
-        if($lno == $srch){
-            // add docs to list
-            $docs[$srch] = idx_parseIndexLine($page_idx,$line);
-
-            $srch = array_shift($wids);        // next word to look up
-            if($srch == null) break;           // no more words
+    foreach(array_keys($wids) as $wlen){
+        sort($wids[$wlen]);
+        $wids[$wlen] = array_unique($wids[$wlen]);
+
+        // Open index
+        idx_touchIndex('i',$wlen);
+        $idx = fopen($conf['indexdir']."/i$wlen.idx",'r');
+        if(!$idx){
+            msg("Failed to open index file",-1);
+            return false;
         }
 
-        $line = ''; // reset line buffer
-        $lno++;     // increase linecounter
+        // Walk the index til the lines are found
+        $lno  = 0;
+        $line = '';
+        $ixids =& $wids[$wlen];
+        $srch = array_shift($ixids);               // which word do we look for?
+        while (!feof($idx)) {
+            // read full line
+            $line .= fgets($idx, 4096);
+            if(substr($line,-1) != "\n") continue;
+            if($lno > $srch)             break;   // shouldn't happen
+
+            // do we want this line?
+            if($lno == $srch){
+                // add docs to list
+                $docs["$wlen*$srch"] = idx_parseIndexLine($page_idx,$line);
+
+                $srch = array_shift($ixids);        // next word to look up
+                if($srch == null) break;           // no more words
+            }
+
+            $line = ''; // reset line buffer
+            $lno++;     // increase linecounter
+        }
+        fclose($idx);
     }
-    fclose($idx);
 
 
     // merge found pages into final result array
diff --git a/inc/init.php b/inc/init.php
index 552f98526..c097cd5bd 100644
--- a/inc/init.php
+++ b/inc/init.php
@@ -133,6 +133,7 @@ function init_paths(){
                  'mediadir'  => 'media',
                  'metadir'   => 'meta',
                  'cachedir'  => 'cache',
+                 'indexdir'  => 'index',
                  'lockdir'   => 'locks');
 
   foreach($paths as $c => $p){
@@ -157,9 +158,7 @@ function init_paths(){
 function init_files(){
   global $conf;
 
-  $files = array( $conf['cachedir'].'/word.idx',
-                  $conf['cachedir'].'/page.idx',
-                  $conf['cachedir'].'/index.idx');
+  $files = array( $conf['indexdir'].'/page.idx');
 
   foreach($files as $file){
     if(!@file_exists($file)){
diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php
index 224c54311..12177406f 100644
--- a/lib/exe/indexer.php
+++ b/lib/exe/indexer.php
@@ -120,6 +120,18 @@ function runIndexer(){
     global $conf;
     print "runIndexer(): started".NL;
 
+    // Move index files (if needed)
+    // Uses the importoldindex plugin to upgrade the index automatically.
+    // FIXME: Remove this from runIndexer when it is no longer needed.
+    if (@file_exists($conf['cachedir'].'/page.idx') &&
+        (!@file_exists($conf['indexdir'].'/page.idx') ||
+         !filesize($conf['indexdir'].'/page.idx'))  &&
+        !@file_exists($conf['indexdir'].'/index_importing')) {
+        echo "trigger TEMPORARY_INDEX_UPGRADE_EVENT\n";
+        $tmp = array(); // no event data
+        trigger_event('TEMPORARY_INDEX_UPGRADE_EVENT', $tmp);
+    }
+
     $ID = cleanID($_REQUEST['id']);
     if(!$ID) return false;
 
@@ -233,7 +245,7 @@ function runSitemapper(){
        return false;
     }
 
-    $pages = file($conf['cachedir'].'/page.idx');
+    $pages = file($conf['indexdir'].'/page.idx');
     print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;
 
     // build the sitemap
diff --git a/lib/plugins/importoldindex/action.php b/lib/plugins/importoldindex/action.php
new file mode 100644
index 000000000..26b37664c
--- /dev/null
+++ b/lib/plugins/importoldindex/action.php
@@ -0,0 +1,58 @@
+<?php
+// must be run within DokuWiki: stop right away when DOKU_INC is not defined
+if(!defined('DOKU_INC')) die();
+// default DOKU_PLUGIN to lib/plugins/ below DOKU_INC, then load action.php from it
+if(!defined('DOKU_PLUGIN')) define('DOKU_PLUGIN',DOKU_INC.'lib/plugins/');
+require_once(DOKU_PLUGIN.'action.php');
+
+class action_plugin_importoldindex extends DokuWiki_Action_Plugin {
+    /** Describe this plugin (author, date, name, description, url). */
+    function getInfo(){
+        return array(
+            'author' => 'Tom N Harris',
+            'email'  => 'tnharris@whoopdedo.org',
+            'date'   => '2006-11-09',
+            'name'   => 'Import Old Index',
+            'desc'   => 'Moves old index files to a new location, sorted by string length.',
+            'url'    => 'http://whoopdedo.org/doku/wiki'
+            );
+    }
+
+    /** Register run_import() as the BEFORE handler of TEMPORARY_INDEX_UPGRADE_EVENT. */
+    function register(&$controller) {
+        $controller->register_hook('TEMPORARY_INDEX_UPGRADE_EVENT', 'BEFORE', $this, 'run_import');
+    }
+
+    /**
+     * Split cachedir's word.idx/index.idx into one w/i file pair per word length
+     * in indexdir, copy page.idx over, then disable this plugin.
+     */
+    function run_import(&$event, $args) {
+        global $conf;
+        touch($conf['indexdir'].'/index_importing'); // index importing lock
+        // load old index; line N of index.idx belongs to the word on line N of word.idx
+        $word_idx = file($conf['cachedir'].'/word.idx');
+        $idx = file($conf['cachedir'].'/index.idx');
+        $words = array(); // word length => line numbers of the words of that length
+        for ($lno=0,$total=count($word_idx);$lno<$total;$lno++){
+            $wlen = strlen($word_idx[$lno])-1; // -1 for the line's newline
+            $words[$wlen][] = $lno;
+        }
+        foreach (array_keys($words) as $wlen) {
+            $new_words = array();
+            $new_idx = array();
+            foreach ($words[$wlen] as $lno) {
+                $new_words[] = $word_idx[$lno];
+                // no index line for this word: write an empty one to keep rows aligned
+                $new_idx[] = isset($idx[$lno]) ? $idx[$lno] : "\n";
+            }
+            io_saveFile($conf['indexdir']."/w$wlen.idx", implode('', $new_words));
+            io_saveFile($conf['indexdir']."/i$wlen.idx", implode('', $new_idx));
+        }
+        @copy($conf['cachedir'].'/page.idx', $conf['indexdir'].'/page.idx');
+        if($conf['fperm']) chmod($conf['indexdir'].'/page.idx', $conf['fperm']);
+        unlink($conf['indexdir'].'/index_importing'); // index importing unlock
+        plugin_disable('importoldindex'); // only needs to run once
+    }
+}
+
-- 
GitLab