From 60c15d7deb9c53bcb1cf7881f441744bb29a6b63 Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Fri, 15 Feb 2008 18:46:53 +0100
Subject: [PATCH] better highlighting for phrase searches FS#1193

This patch makes the highlighting of phrases in search snippets and on
the pages itself much better. Now a regexp gets passed to the ?s

darcs-hash:20080215174653-7ad00-cd2d6f7d408db7b7dd3cb9974c3eb27f3a9baeac.gz
---
 inc/fulltext.php | 27 ++++++++++++++++-----------
 inc/html.php     | 19 +++++++++----------
 inc/indexer.php  |  2 +-
 3 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/inc/fulltext.php b/inc/fulltext.php
index 3131b7433..b10cbde8e 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -23,10 +23,11 @@ function ft_preg_quote_cb($string){
  * Returns a list of matching documents for the given query
  *
  */
-function ft_pageSearch($query,&$poswords){
+function ft_pageSearch($query,&$regex){
     $q = ft_queryParser($query);
-    // use this for higlighting later:
-    $poswords = str_replace('*','',join(' ',$q['and']));
+
+    // remember for hilighting later
+    $regex = str_replace('*','',join('|',$q['words']));
 
     // lookup all words found in the query
     $words = array_merge($q['and'],$q['not']);
@@ -78,6 +79,9 @@ function ft_pageSearch($query,&$poswords){
         //build a regexp
         $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
         $q['phrases'] = array_map('ft_preg_quote_cb',$q['phrases']);
+        // use this for higlighting later:
+        if($regex !== '') $regex .= '|';
+        $regex .= join('|',$q['phrases']);
         // check the source of all documents for the exact phrases
         foreach(array_keys($docs) as $id){
             $text = utf8_strtolower(rawWiki($id));
@@ -196,18 +200,15 @@ function ft_pageLookup($id,$pageonly=true){
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  */
-function ft_snippet($id,$poswords){
-    $poswords = preg_quote($poswords,'#');
-    $re = '('.str_replace(' ','|',$poswords).')';
+function ft_snippet($id,$re){
     $text = rawWiki($id);
-
     $match = array();
     $snippets = array();
     $utf8_offset = $offset = $end = 0;
     $len = utf8_strlen($text);
 
     for ($cnt=3; $cnt--;) {
-      if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
+      if (!preg_match('#('.$re.')#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
 
       list($str,$idx) = $match[0];
 
@@ -258,7 +259,7 @@ function ft_snippet($id,$poswords){
     }
 
     $m = "\1";
-    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
+    $snippets = preg_replace('#('.$re.')#iu',$m.'$1'.$m,$snippets);
     $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));
 
     return $snippet;
@@ -314,6 +315,7 @@ function ft_queryParser($query){
     $q['query'] = $query;
     $q['ns'] = array();
     $q['phrases'] = array();
+    $q['words'] = array();
     $q['and'] = array();
     $q['not'] = array();
 
@@ -337,12 +339,15 @@ function ft_queryParser($query){
                 if(count($token)) $q['not'] = array_merge($q['not'],$token);
             }else{
                 // asian "words" need to be searched as phrases
-                if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
+                if(@preg_match_all('/(('.IDX_ASIAN.')+)/u',$w,$matches)){
                     $q['phrases'] = array_merge($q['phrases'],$matches[1]);
                 }
                 $token = idx_tokenizer($w,$stopwords,true);
-                if(count($token)) $q['and'] = array_merge($q['and'],$token);
+                if(count($token)){
+                    $q['and'] = array_merge($q['and'],$token);
+                    $q['words'] = array_merge($q['words'],$token);
+                }
             }
         }
 
diff --git a/inc/html.php b/inc/html.php
index caf52b85c..148d111aa 100644
--- a/inc/html.php
+++ b/inc/html.php
@@ -262,13 +262,12 @@ function html_draft(){
  * @author Andreas Gohr <andi@splitbrain.org>
  * @author Harry Fuecks <hfuecks@gmail.com>
  */
-function html_hilight($html,$query){
-    //split at common delimiters
-    $queries = preg_split ('/[\s\'"\\\\`()\]\[?:!\.{};,#+*<>\\/]+/',$query,-1,PREG_SPLIT_NO_EMPTY);
-    foreach ($queries as $q){
-        $q = preg_quote($q,'/');
-        $html = preg_replace_callback("/((<[^>]*)|$q)/i",'html_hilight_callback',$html);
-    }
+function html_hilight($html,$regex){
+    // strip everything that's special except pipes:
+    $regex = preg_replace('![\[\]()/\\\\?\.+*]+!','',$regex);
+
+    if ($regex === '') return $html;
+    $html = preg_replace_callback("/((<[^>]*)|$regex)/i",'html_hilight_callback',$html);
     return $html;
 }
 
@@ -343,15 +342,15 @@ function html_search(){
     flush();
 
     //do fulltext search
-    $data = ft_pageSearch($QUERY,$poswords);
+    $data = ft_pageSearch($QUERY,$regex);
     if(count($data)){
         $num = 1;
         foreach($data as $id => $cnt){
             print '<div class="search_result">';
-            print html_wikilink(':'.$id,$conf['useheading']?NULL:$id,$poswords);
+            print html_wikilink(':'.$id,$conf['useheading']?NULL:$id,$regex);
             print ': <span class="search_cnt">'.$cnt.' '.$lang['hits'].'</span><br />';
             if($num < 15){ // create snippets for the first number of matches only #FIXME add to conf ?
-                print '<div class="search_snippet">'.ft_snippet($id,$poswords).'</div>';
+                print '<div class="search_snippet">'.ft_snippet($id,$regex).'</div>';
             }
             print '</div>';
             flush();
diff --git a/inc/indexer.php b/inc/indexer.php
index 12e774579..ff2d332dc 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -615,7 +615,7 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
 
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
         // handle asian chars as single words (may fail on older PHP version)
-        $asia = @preg_replace('/('.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')/u',' \1 ',$string);
+        $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
         if(!is_null($asia)) $string = $asia; //recover from regexp failure
 
         $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
-- 
GitLab