From 60c15d7deb9c53bcb1cf7881f441744bb29a6b63 Mon Sep 17 00:00:00 2001
From: Andreas Gohr <andi@splitbrain.org>
Date: Fri, 15 Feb 2008 18:46:53 +0100
Subject: [PATCH] better highlighting for phrase searches FS#1193

This patch makes the highlighting of phrases in search snippets and on
the pages themselves much better.

Now a regexp (built from the query words and phrases) gets passed to the
highlighter via the ?s parameter instead of a plain word list.

darcs-hash:20080215174653-7ad00-cd2d6f7d408db7b7dd3cb9974c3eb27f3a9baeac.gz
---
 inc/fulltext.php | 27 ++++++++++++++++-----------
 inc/html.php     | 19 +++++++++----------
 inc/indexer.php  |  2 +-
 3 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/inc/fulltext.php b/inc/fulltext.php
index 3131b7433..b10cbde8e 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -23,10 +23,11 @@ function ft_preg_quote_cb($string){
  * Returns a list of matching documents for the given query
  *
  */
-function ft_pageSearch($query,&$poswords){
+function ft_pageSearch($query,&$regex){
     $q = ft_queryParser($query);
-    // use this for higlighting later:
-    $poswords = str_replace('*','',join(' ',$q['and']));
+
+    // remember for hilighting later
+    $regex = str_replace('*','',join('|',$q['words']));
 
     // lookup all words found in the query
     $words  = array_merge($q['and'],$q['not']);
@@ -78,6 +79,9 @@ function ft_pageSearch($query,&$poswords){
         //build a regexp
         $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
         $q['phrases'] = array_map('ft_preg_quote_cb',$q['phrases']);
+        // use this for higlighting later:
+        if($regex !== '') $regex .= '|';
+        $regex .= join('|',$q['phrases']);
         // check the source of all documents for the exact phrases
         foreach(array_keys($docs) as $id){
             $text  = utf8_strtolower(rawWiki($id));
@@ -196,18 +200,15 @@ function ft_pageLookup($id,$pageonly=true){
  *
  * @author Andreas Gohr <andi@splitbrain.org>
  */
-function ft_snippet($id,$poswords){
-    $poswords = preg_quote($poswords,'#');
-    $re       = '('.str_replace(' ','|',$poswords).')';
+function ft_snippet($id,$re){
     $text     = rawWiki($id);
-
     $match = array();
     $snippets = array();
     $utf8_offset = $offset = $end = 0;
     $len = utf8_strlen($text);
 
     for ($cnt=3; $cnt--;) {
-      if (!preg_match('#'.$re.'#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
+      if (!preg_match('#('.$re.')#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
 
       list($str,$idx) = $match[0];
 
@@ -258,7 +259,7 @@ function ft_snippet($id,$poswords){
     }
 
     $m = "\1";
-    $snippets = preg_replace('#'.$re.'#iu',$m.'$1'.$m,$snippets);
+    $snippets = preg_replace('#('.$re.')#iu',$m.'$1'.$m,$snippets);
     $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));
 
     return $snippet;
@@ -314,6 +315,7 @@ function ft_queryParser($query){
     $q['query']   = $query;
     $q['ns']      = array();
     $q['phrases'] = array();
+    $q['words']   = array();
     $q['and']     = array();
     $q['not']     = array();
 
@@ -337,12 +339,15 @@ function ft_queryParser($query){
             if(count($token)) $q['not'] = array_merge($q['not'],$token);
         }else{
             // asian "words" need to be searched as phrases
-            if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
+            if(@preg_match_all('/(('.IDX_ASIAN.')+)/u',$w,$matches)){
                 $q['phrases'] = array_merge($q['phrases'],$matches[1]);
 
             }
             $token = idx_tokenizer($w,$stopwords,true);
-            if(count($token)) $q['and'] = array_merge($q['and'],$token);
+            if(count($token)){
+                $q['and']   = array_merge($q['and'],$token);
+                $q['words'] = array_merge($q['words'],$token);
+            }
         }
     }
 
diff --git a/inc/html.php b/inc/html.php
index caf52b85c..148d111aa 100644
--- a/inc/html.php
+++ b/inc/html.php
@@ -262,13 +262,12 @@ function html_draft(){
  * @author Andreas Gohr <andi@splitbrain.org>
  * @author Harry Fuecks <hfuecks@gmail.com>
  */
-function html_hilight($html,$query){
-  //split at common delimiters
-  $queries = preg_split ('/[\s\'"\\\\`()\]\[?:!\.{};,#+*<>\\/]+/',$query,-1,PREG_SPLIT_NO_EMPTY);
-  foreach ($queries as $q){
-     $q = preg_quote($q,'/');
-     $html = preg_replace_callback("/((<[^>]*)|$q)/i",'html_hilight_callback',$html);
-  }
+function html_hilight($html,$regex){
+  // strip everything that's special except pipes:
+  $regex = preg_replace('![\[\]()/\\\\?\.+*]+!','',$regex);
+
+  if ($regex === '') return $html;
+  $html = preg_replace_callback("/((<[^>]*)|$regex)/i",'html_hilight_callback',$html);
   return $html;
 }
 
@@ -343,15 +342,15 @@ function html_search(){
   flush();
 
   //do fulltext search
-  $data = ft_pageSearch($QUERY,$poswords);
+  $data = ft_pageSearch($QUERY,$regex);
   if(count($data)){
     $num = 1;
     foreach($data as $id => $cnt){
       print '<div class="search_result">';
-      print html_wikilink(':'.$id,$conf['useheading']?NULL:$id,$poswords);
+      print html_wikilink(':'.$id,$conf['useheading']?NULL:$id,$regex);
       print ': <span class="search_cnt">'.$cnt.' '.$lang['hits'].'</span><br />';
       if($num < 15){ // create snippets for the first number of matches only #FIXME add to conf ?
-        print '<div class="search_snippet">'.ft_snippet($id,$poswords).'</div>';
+        print '<div class="search_snippet">'.ft_snippet($id,$regex).'</div>';
       }
       print '</div>';
       flush();
diff --git a/inc/indexer.php b/inc/indexer.php
index 12e774579..ff2d332dc 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -615,7 +615,7 @@ function idx_tokenizer($string,&$stopwords,$wc=false){
 
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
         // handle asian chars as single words (may fail on older PHP version)
-        $asia = @preg_replace('/('.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')/u',' \1 ',$string);
+        $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
         if(!is_null($asia)) $string = $asia; //recover from regexp failure
 
         $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
-- 
GitLab