From f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694 Mon Sep 17 00:00:00 2001 From: Andreas Gohr <andi@splitbrain.org> Date: Sun, 28 Aug 2005 17:28:21 +0200 Subject: [PATCH] new fulltext search function using the index The new search function was added but is not yet integrated into DokuWiki's interface. darcs-hash:20050828152821-7ad00-a6e79a9dc5aaf41c547cf42dccdbc3b5bc8d303e.gz --- inc/fulltext.php | 147 +++++++++++++++++++++++++++++++++++++++++++++++ inc/indexer.php | 57 ++++++++++++------ 2 files changed, 186 insertions(+), 18 deletions(-) create mode 100644 inc/fulltext.php diff --git a/inc/fulltext.php b/inc/fulltext.php new file mode 100644 index 000000000..8549a67c1 --- /dev/null +++ b/inc/fulltext.php @@ -0,0 +1,147 @@ +<?php +/** + * DokuWiki fulltextsearch functions using the index + * + * @license GPL 2 (http://www.gnu.org/licenses/gpl.html) + * @author Andreas Gohr <andi@splitbrain.org> + */ + + if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/'); + require_once(DOKU_INC.'inc/indexer.php'); + + +/** + * The fulltext search + * + * Returns a list of matching documents for the given query + */ +function ft_pageSearch($query){ + $q = ft_queryParser($query); + + // lookup all words found in the query + $words = array_merge($q['and'],$q['not']); + foreach($q['phrases'] as $phrase){ + $words = array_merge($words,$phrase['words']); + } + if(!count($words)) return array(); + $result = idx_lookup($words); + + // merge search results with query + foreach($q['and'] as $pos => $w){ + $q['and'][$pos] = $result[$w]; + } + // create a list of unwanted docs + $not = array(); + foreach($q['not'] as $pos => $w){ + $not = array_merge($not,array_keys($result[$w])); + } + + + // combine and words + if(count($q['and']) > 1){ + $docs = ft_resultCombine($q['and']); + }else{ + $docs = $q['and'][0]; + } + if(!count($docs)) return array(); + + // remove negative matches + foreach($not as $n){ + unset($docs[$n]); + } + + if(!count($docs)) return array(); + + 
+ // handle phrases + if(count($q['phrases'])){ + //build a regexp + $q['phrases'] = array_map('utf8_strtolower',$q['phrases']); + $q['phrases'] = array_map('preg_quote',$q['phrases']); + $regex = '('.join('|',$q['phrases']).')'; + + // check the source of all documents for the exact phrases + foreach(array_keys($docs) as $id){ + $text = utf8_strtolower(rawWiki($id)); + if(!preg_match_all('/'.$regex.'/usi',$text)){ + unset($docs[$id]); // no hit - remove + } + } + } + + if(!count($docs)) return array(); + + // if there are any hits left, sort them by count + arsort($docs); + + return $docs; +} + +/** + * Combine found documents and sum up their scores + * + * This function is used to combine searched words with a logical + * AND. Only documents available in all arrays are returned. + * + * based upon PEAR's PHP_Compat function for array_intersect_key() + * + * @param array $args An array of page arrays + */ +function ft_resultCombine($args){ + $array_count = count($args); + $result = array(); + foreach ($args[0] as $key1 => $value1) { + for ($i = 1; $i !== $array_count; $i++) { + foreach ($args[$i] as $key2 => $value2) { + if ((string) $key1 === (string) $key2) { + if(!isset($result[$key1])) $result[$key1] = $value1; + $result[$key1] += $value2; + } + } + } + } + return $result; +} + +/** + * Builds an array of search words from a query + * + * @todo support OR and parentheses? 
+ */ +function ft_queryParser($query){ + global $conf; + $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; + if(@file_exists($swfile)){ + $stopwords = file($swfile); + }else{ + $stopwords = array(); + } + + $q = array(); + $q['query'] = $query; + $q['phrases'] = array(); + $q['and'] = array(); + $q['not'] = array(); + + // handle phrase searches + while(preg_match('/"(.*?)"/',$query,$match)){ + $q['phrases'][] = $match[0]; + $q['and'] = array_merge(idx_tokenizer($match[0],$stopwords)); + $query = preg_replace('/"(.*?)"/','',$query,1); + } + + $words = explode(' ',$query); + foreach($words as $w){ + if($w{0} == '-'){ + $token = idx_tokenizer($w,$stopwords); + if(count($token)) $q['not'] = array_merge($q['not'],$token); + }else{ + $token = idx_tokenizer($w,$stopwords); + if(count($token)) $q['and'] = array_merge($q['and'],$token); + } + } + + return $q; +} + + diff --git a/inc/indexer.php b/inc/indexer.php index fd20a4747..65ae126dd 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -191,9 +191,6 @@ function idx_writeIndexLine($fh,$line,$pid,$count){ * Takes an array of word and will return a list of matching * documents for each one. * - * It returns an array using the same index as the input - * array. Returns false if something went wrong. 
- * * @author Andreas Gohr <andi@splitbrain.org> */ function idx_lookup($words){ @@ -207,21 +204,17 @@ function idx_lookup($words){ // get word IDs $wids = array(); - $pos = 0; foreach($words as $word){ - - //FIXME words should be cleaned here as in getPageWords - $wid = array_search("$word\n",$word_idx); if(is_int($wid)){ $wids[] = $wid; - $result[$pos]['wordid'] = $wid; + $result[$word] = $wid; + }else{ + $result[$word] = array(); } - $result[$pos]['word'] = $word; - $pos++; } sort($wids); - + $wids = array_unique($wids); // Open index $idx = fopen($conf['cachedir'].'/index.idx','r'); @@ -256,15 +249,14 @@ function idx_lookup($words){ } fclose($idx); - // merge docs into results - $count = count($result); - for($i=0; $i<$count; $i++){ - if(isset($result[$i]['wordid'])){ - $result[$i]['pages'] = $docs[$result[$i]['wordid']]; + // merge found pages into result array + foreach(array_keys($result) as $word){ + if(is_int($result[$word])){ + $result[$word] = $docs[$result[$word]]; } } -dbg($result); + return $result; } /** @@ -281,7 +273,7 @@ function idx_parseIndexLine(&$page_idx,$line){ $result = array(); $line = trim($line); - if($line == '') return; + if($line == '') return $result; $parts = explode(':',$line); foreach($parts as $part){ @@ -298,4 +290,33 @@ function idx_parseIndexLine(&$page_idx,$line){ return $result; } +/** + * Tokenizes a string into an array of search words + * + * Uses the same algorithm as idx_getPageWords() + * + * @todo make combined function to use alone or in getPageWords + */ +function idx_tokenizer($string,&$stopwords){ + $words = array(); + + if(preg_match('/[^0-9A-Za-z]/u', $string)){ + $arr = explode(' ', utf8_stripspecials($string,' ','._\-:')); + foreach ($arr as $w) { + if (!is_numeric($w) && strlen($w) < 3) continue; + $w = utf8_strtolower($w); + if(is_int(array_search("$w\n",$stopwords))) continue; + $words[] = $w; + } + }else{ + $w = $string; + if (!is_numeric($w) && strlen($w) < 3) return $words; + $w = strtolower($w); + 
if(is_int(array_search("$w\n",$stopwords))) return $words; + $words[] = $w; + } + + return $words; +} + //Setup VIM: ex: et ts=4 enc=utf-8 : -- GitLab