From b4ce25e9a449e7a6a78476bf94bca31cbc4259ce Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Sun, 7 Aug 2005 22:33:22 +0200
Subject: [PATCH] a first step for search indexing - nothing to see yet

darcs-hash:20050807203322-7ad00-6db6733f8fcd861366856635ba3d205fd3bb54da.gz
---
 inc/indexer.php | 63 ++++++++++++++++++++++++++++++++++++++++++++
 inc/utf8.php    |  9 ++++---
 2 files changed, 68 insertions(+), 4 deletions(-)
 create mode 100644 inc/indexer.php

diff --git a/inc/indexer.php b/inc/indexer.php
new file mode 100644
index 000000000..3fb710338
--- /dev/null
+++ b/inc/indexer.php
@@ -0,0 +1,63 @@
+<?php
+/**
+ * Functions to create the fulltext search index
+ *
+ * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
+ * @author     Andreas Gohr <andi@splitbrain.org>
+ */
+
+  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
+  require_once(DOKU_CONF.'dokuwiki.php');
+  require_once(DOKU_INC.'inc/io.php');
+  require_once(DOKU_INC.'inc/utf8.php');
+  require_once(DOKU_INC.'inc/parserutils.php');
+
+/**
+ * Build a list of the words used in a page and how often they occur
+ *
+ * based upon class.search_indexer_phpcms.php::index_entry
+ *
+ * Special chars (plus '.', '_', '-' and ':') are replaced by spaces, the
+ * text is lowercased and split at spaces. Non-numeric words shorter than
+ * three bytes are skipped.
+ *
+ * @param  string $id  page id, handed to rawWiki() to fetch the text
+ * @return array       word => number of occurrences, sorted by word
+ */
+function idx_getPageWords($id){
+    $body  = rawWiki($id);
+    $body  = utf8_stripspecials($body,' ','._\-:');
+    $body  = utf8_strtolower($body);
+    $words = explode(' ',trim($body));
+
+    $index = array(); //resulting index
+
+    //compact wordlist FIXME check for stopwords
+    foreach($words as $word){
+        // consecutive spaces leave empty entries behind
+        if($word === '') continue;
+
+        // checking minimum word-size in bytes (excepting numbers)
+        // FIXME add config option for min wordsize
+        if(!is_numeric($word) && strlen($word) < 3) continue;
+
+        //FIXME add stopword check
+
+        // count per exact string - a loose == would treat numeric
+        // strings like '007' and '7' as the same word
+        if(isset($index[$word])){
+            $index[$word]++;
+        }else{
+            $index[$word] = 1;
+        }
+    }
+
+    // keep the result ordered by word
+    ksort($index,SORT_STRING);
+
+    return $index;
+}
+
+
+
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/utf8.php b/inc/utf8.php
index 3eb06865f..176b9f813 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -294,10 +294,11 @@ function utf8_deaccent($string,$case=0){
  * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
  *
  * @author Andreas Gohr <andi@splitbrain.org>
- * @param string $string The UTF8 string to strip of special chars
- * @param string $repl   Replace special with this string
+ * @param string $string     The UTF8 string to strip of special chars
+ * @param string $repl       Replace special with this string
+ * @param string $additional Additional chars to strip (used in regexp char class)
  */
-function utf8_stripspecials($string,$repl=''){
+function utf8_stripspecials($string,$repl='',$additional=''){
   global $UTF8_SPECIAL_CHARS;
 
   static $specials = null;
@@ -305,7 +306,7 @@ function utf8_stripspecials($string,$repl=''){
     $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
   }
 
-  return preg_replace('/[\x00-\x19'.$specials.']/u',$repl,$string);
+  return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
 }
 
 /**
-- 
GitLab