From b4ce25e9a449e7a6a78476bf94bca31cbc4259ce Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Sun, 7 Aug 2005 22:33:22 +0200
Subject: [PATCH] a first step for search indexing - nothing to see yet

darcs-hash:20050807203322-7ad00-6db6733f8fcd861366856635ba3d205fd3bb54da.gz
---
 inc/indexer.php | 70 +++++++++++++++++++++++++++++++++++++++++++++++++
 inc/utf8.php    |  9 ++++---
 2 files changed, 75 insertions(+), 4 deletions(-)
 create mode 100644 inc/indexer.php

diff --git a/inc/indexer.php b/inc/indexer.php
new file mode 100644
index 000000000..3fb710338
--- /dev/null
+++ b/inc/indexer.php
@@ -0,0 +1,70 @@
+<?php
+/**
+ * Search indexing functions
+ *
+ * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
+ * @author     Andreas Gohr <andi@splitbrain.org>
+ */
+
+  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
+  require_once(DOKU_CONF.'dokuwiki.php');
+  require_once(DOKU_INC.'inc/io.php');
+  require_once(DOKU_INC.'inc/utf8.php');
+  require_once(DOKU_INC.'inc/parserutils.php');
+
+/**
+ * Builds a word => occurrence count list for the raw text of a page
+ *
+ * Words are sorted and compared as plain strings ('010' is not '10');
+ * non-numeric words shorter than 3 bytes are skipped.
+ * based upon class.search_indexer_phpcms.php::index_entry
+ *
+ * @param  string $id page id, handed to rawWiki()
+ * @return array  word => number of occurrences
+ */
+function idx_getPageWords($id){
+    $body  = rawWiki($id);
+    $body  = utf8_stripspecials($body,' ','._\-:');
+    $body  = utf8_strtolower($body);
+    $body  = trim($body);
+    $words = explode(' ',$body);
+    // plain string order keeps identical tokens adjacent
+    sort($words, SORT_STRING);
+
+    $index = array(); //resulting index
+    $old   = '';
+    $doit  = true;
+
+    //compact wordlist FIXME check for stopwords
+    foreach($words as $word){
+        if(strlen($word) == 0) continue;
+        // it's the same word (strict, so '010' never matches '10')
+        if($word === $old){
+            // we didn't want it last time
+            if($doit == false) continue;
+            // just increase the counter
+            $index[$word]++;
+            continue;
+        }
+
+        // remember old word
+        $old  = $word;
+        $doit = true;
+
+        // checking minimum word-size (excepting numbers)
+        if(!is_numeric($word) && strlen($word) < 3) { #FIXME add config option for min wordsize
+            $doit = false;
+            continue;
+        }
+
+        //FIXME add stopword check
+        // add to index
+        $index[$word] = 1;
+    }
+
+    return $index;
+}
+
+
+
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/utf8.php b/inc/utf8.php
index 3eb06865f..176b9f813 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -294,10 +294,11 @@ function utf8_deaccent($string,$case=0){
  * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
  *
  * @author Andreas Gohr <andi@splitbrain.org>
- * @param  string $string The UTF8 string to strip of special chars
- * @param  string $repl   Replace special with this string
+ * @param  string $string     The UTF8 string to strip of special chars
+ * @param  string $repl       Replace special with this string
+ * @param  string $additional Additional chars to strip (used in regexp char class)
  */
-function utf8_stripspecials($string,$repl=''){
+function utf8_stripspecials($string,$repl='',$additional=''){
   global $UTF8_SPECIAL_CHARS;
 
   static $specials = null;
@@ -305,7 +306,7 @@ function utf8_stripspecials($string,$repl=''){
     $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
   }
 
-  return preg_replace('/[\x00-\x19'.$specials.']/u',$repl,$string);
+  return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
 }
 
 /**
-- 
GitLab