Skip to content
Snippets Groups Projects
Commit f5eb7cf0 authored by Andreas Gohr's avatar Andreas Gohr
Browse files

new fulltext search function using the index

The new search function was added but is not yet integrated into
DokuWiki's interface.

darcs-hash:20050828152821-7ad00-a6e79a9dc5aaf41c547cf42dccdbc3b5bc8d303e.gz
parent 488dd6ce
No related branches found
No related tags found
No related merge requests found
<?php
/**
* DokuWiki fulltextsearch functions using the index
*
* @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
* @author Andreas Gohr <andi@splitbrain.org>
*/
// Fall back to the parent of this file's directory (with a trailing slash)
// when the including script has not defined DOKU_INC itself.
if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
// Load inc/indexer.php relative to DOKU_INC (only once per request).
require_once(DOKU_INC.'inc/indexer.php');
/**
 * The fulltext search
 *
 * Parses the query, looks up all query words in the index, combines the
 * wanted words with a logical AND, removes every page matching an unwanted
 * (negated) word and finally keeps only those pages whose raw wiki text
 * contains at least one of the quoted phrases (if any were given).
 *
 * @param  string $query  The search query as entered by the user
 * @return array  Matching pages as (page id => score), sorted by descending
 *                score; an empty array if nothing matched
 */
function ft_pageSearch($query){
    $q = ft_queryParser($query);

    // Look up all words found in the query. The words of quoted phrases are
    // already part of $q['and'], $q['phrases'] itself holds plain strings.
    $words = array_merge($q['and'],$q['not']);
    if(!count($words)) return array();

    $result = idx_lookup($words);
    if(!is_array($result)) return array();

    // collect one page list per wanted word
    $sets = array();
    foreach($q['and'] as $w){
        $sets[] = (isset($result[$w]) && is_array($result[$w])) ? $result[$w] : array();
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // a query made of negated words only has nothing to subtract from
    if(!count($sets)) return array();

    // combine and words
    if(count($sets) > 1){
        $docs = ft_resultCombine($sets);
    }else{
        $docs = $sets[0];
    }
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }
    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp alternative per phrase
        $alternatives = array();
        foreach($q['phrases'] as $phrase){
            // tolerate phrases that still carry their surrounding quotes
            $phrase = trim($phrase,'"');
            if($phrase === '') continue; // would match every document
            // quote for use between '/' delimiters
            $alternatives[] = preg_quote(utf8_strtolower($phrase),'/');
        }

        if(count($alternatives)){
            $regex = '/('.join('|',$alternatives).')/usi';

            // check the source of all documents for the exact phrases
            foreach(array_keys($docs) as $id){
                $text = utf8_strtolower(rawWiki($id));
                if(!preg_match($regex,$text)){
                    unset($docs[$id]); // no hit - remove
                }
            }
        }
    }
    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);
    return $docs;
}
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in ALL given arrays are returned, the
 * score of such a document is the sum of its scores in every array.
 * The order of the result follows the order of the first array.
 *
 * A single page array is returned unchanged, an empty list of page
 * arrays gives an empty result.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param  array $args  An array of page arrays (page id => score)
 * @return array        The intersection as (page id => summed score)
 */
function ft_resultCombine($args){
    $args        = array_values($args); // make sure we can address the sets by 0..n-1
    $array_count = count($args);

    if($array_count == 0 || !is_array($args[0])) return array();
    if($array_count == 1) return $args[0];

    $result = array();
    foreach($args[0] as $key => $value){
        $score = $value;
        for($i = 1; $i < $array_count; $i++){
            // missing in a single set means the document fails the AND
            if(!is_array($args[$i]) || !array_key_exists($key,$args[$i])) continue 2;
            $score += $args[$i][$key];
        }
        $result[$key] = $score;
    }
    return $result;
}
/**
 * Builds an array of search words from a query
 *
 * Quoted parts of the query ("like this") are treated as phrases, words
 * prefixed with a minus sign are unwanted words, everything else is a
 * wanted word. All words are run through idx_tokenizer() using the
 * stopword list of the configured language (if there is one).
 *
 * The returned array has these keys:
 *   'query'   - the query string as passed in
 *   'phrases' - list of phrases (strings, without the surrounding quotes)
 *   'and'     - list of wanted words, including the words of all phrases
 *   'not'     - list of unwanted words
 *
 * @todo support OR and parentheses?
 *
 * @param  string $query  The search query
 * @return array          The parsed query, see above
 */
function ft_queryParser($query){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array(); // file could not be read
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // store the phrase itself, not the quotes around it
        $q['phrases'][] = $match[1];
        // add (not replace) the phrase words to the wanted words
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    foreach(explode(' ',$query) as $w){
        if($w === '') continue; // consecutive blanks produce empty entries

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if($w[0] == '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}
......@@ -191,9 +191,6 @@ function idx_writeIndexLine($fh,$line,$pid,$count){
* Takes an array of words and returns a list of matching
* documents for each one.
*
* It returns an array using the same index as the input
* array. Returns false if something went wrong.
*
* @author Andreas Gohr <andi@splitbrain.org>
*/
function idx_lookup($words){
......@@ -207,21 +204,17 @@ function idx_lookup($words){
// get word IDs
$wids = array();
$pos = 0;
foreach($words as $word){
//FIXME words should be cleaned here as in getPageWords
$wid = array_search("$word\n",$word_idx);
if(is_int($wid)){
$wids[] = $wid;
$result[$pos]['wordid'] = $wid;
$result[$word] = $wid;
}else{
$result[$word] = array();
}
$result[$pos]['word'] = $word;
$pos++;
}
sort($wids);
$wids = array_unique($wids);
// Open index
$idx = fopen($conf['cachedir'].'/index.idx','r');
......@@ -256,15 +249,14 @@ function idx_lookup($words){
}
fclose($idx);
// merge docs into results
$count = count($result);
for($i=0; $i<$count; $i++){
if(isset($result[$i]['wordid'])){
$result[$i]['pages'] = $docs[$result[$i]['wordid']];
// merge found pages into result array
foreach(array_keys($result) as $word){
if(is_int($result[$word])){
$result[$word] = $docs[$result[$word]];
}
}
dbg($result);
return $result;
}
/**
......@@ -281,7 +273,7 @@ function idx_parseIndexLine(&$page_idx,$line){
$result = array();
$line = trim($line);
if($line == '') return;
if($line == '') return $result;
$parts = explode(':',$line);
foreach($parts as $part){
......@@ -298,4 +290,33 @@ function idx_parseIndexLine(&$page_idx,$line){
return $result;
}
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * A string made of ASCII letters and digits only is taken as a single
 * word. Any other string has its special chars replaced by blanks and is
 * split at blanks. Non-numeric words shorter than three bytes and words
 * found in the stopword list are dropped, all words are lowercased.
 *
 * Stopwords are compared without their surrounding whitespace, so lines
 * as returned by file() match regardless of their line ending (or a
 * missing one on the last line).
 *
 * @todo make combined function to use alone or in getPageWords
 *
 * @param  string $string     The string to tokenize
 * @param  array  $stopwords  List of stopwords, it is not modified
 * @return array              List of lowercased words
 */
function idx_tokenizer($string,&$stopwords){
    // normalize the stopword lines once, do not touch the caller's array
    $stops = array();
    if(is_array($stopwords)){
        foreach($stopwords as $sw){
            $stops[] = trim($sw);
        }
    }

    // anything besides plain ASCII letters and digits needs the special handling
    $special = (bool) preg_match('/[^0-9A-Za-z]/u', $string);
    if($special){
        $candidates = explode(' ', utf8_stripspecials($string,' ','._\-:'));
    }else{
        $candidates = array($string);
    }

    $words = array();
    foreach($candidates as $w){
        if(!is_numeric($w) && strlen($w) < 3) continue;
        $w = $special ? utf8_strtolower($w) : strtolower($w);
        // strict comparison: '1e3' must not match a stopword '1000'
        if(in_array($w,$stops,true)) continue;
        $words[] = $w;
    }
    return $words;
}
//Setup VIM: ex: et ts=4 enc=utf-8 :
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment