Skip to content
Snippets Groups Projects
Commit b4ce25e9 authored by Andreas Gohr's avatar Andreas Gohr
Browse files

a first step for search indexing - nothing to see yet

darcs-hash:20050807203322-7ad00-6db6733f8fcd861366856635ba3d205fd3bb54da.gz
parent 1caeb00a
No related branches found
No related tags found
No related merge requests found
<?php
/**
* Common DokuWiki functions
*
* @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
* @author Andreas Gohr <andi@splitbrain.org>
*/
// Derive the installation root from this file's location (its parent directory)
// unless DOKU_INC has already been defined.
if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
// NOTE(review): DOKU_CONF is used here but never defined in this file - presumably
// the including script defines it beforehand; confirm before loading this file standalone.
require_once(DOKU_CONF.'dokuwiki.php');
require_once(DOKU_INC.'inc/io.php');
require_once(DOKU_INC.'inc/utf8.php');
require_once(DOKU_INC.'inc/parserutils.php');
/**
 * Build a word => occurrence-count list for a wiki page
 *
 * The raw page text is fetched with rawWiki(), special characters (plus
 * '.', '_', '-' and ':') are replaced by spaces, the text is lowercased and
 * then split on spaces. Empty tokens and non-numeric tokens shorter than
 * three bytes are dropped.
 *
 * Words are counted by their exact string value. The previous sort-and-compare
 * approach used loose comparison, which treats numeric-looking tokens such as
 * "1000" and "1e3" as equal and could miscount or raise undefined-index
 * notices.
 *
 * based upon class.search_indexer_phpcms.php::index_entry
 *
 * @param  string $id  The page to index (handed to rawWiki())
 * @return array       word => number of occurrences, ordered by word
 */
function idx_getPageWords($id){
    $body  = rawWiki($id);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);

    $index = array(); //resulting index

    //compact wordlist FIXME check for stopwords
    foreach($words as $word){
        // consecutive separators produce empty tokens
        if($word === '') continue;

        // checking minimum word-size (excepting numbers)
        // strlen() counts bytes, so multibyte characters weigh more than one
        // FIXME add config option for min wordsize
        if(!is_numeric($word) && strlen($word) < 3) continue;

        //FIXME add stopword check

        // add to index or just increase the counter
        if(isset($index[$word])){
            $index[$word]++;
        }else{
            $index[$word] = 1;
        }
    }

    // keep a deterministic, word-ordered result independent of input order
    ksort($index, SORT_STRING);

    return $index;
}
//Setup VIM: ex: et ts=4 enc=utf-8 :
...@@ -294,10 +294,11 @@ function utf8_deaccent($string,$case=0){ ...@@ -294,10 +294,11 @@ function utf8_deaccent($string,$case=0){
* stripped chars (they are not included in $UTF8_SPECIAL_CHARS) * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
* *
* @author Andreas Gohr <andi@splitbrain.org> * @author Andreas Gohr <andi@splitbrain.org>
* @param string $string The UTF8 string to strip of special chars * @param string $string The UTF8 string to strip of special chars
* @param string $repl Replace special with this string * @param string $repl Replace special with this string
* @param string $additional Additional chars to strip (used in regexp char class)
*/ */
function utf8_stripspecials($string,$repl=''){ function utf8_stripspecials($string,$repl='',$additional=''){
global $UTF8_SPECIAL_CHARS; global $UTF8_SPECIAL_CHARS;
static $specials = null; static $specials = null;
...@@ -305,7 +306,7 @@ function utf8_stripspecials($string,$repl=''){ ...@@ -305,7 +306,7 @@ function utf8_stripspecials($string,$repl=''){
$specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/'); $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
} }
return preg_replace('/[\x00-\x19'.$specials.']/u',$repl,$string); return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
} }
/** /**
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment