From 60e91a171860bce870e3b3e9109d1313ed6bc071 Mon Sep 17 00:00:00 2001 From: Andreas Gohr <gohr@cosmocode.de> Date: Mon, 26 Oct 2009 10:23:59 +0100 Subject: [PATCH] added FULLTEXT_SNIPPET_CREATE event Ignore-this: a0ebcdd129f4256e4be029e7fdf7ca45 darcs-hash:20091026092359-6e07b-4c41896825e091a3c8fbbeadc3bc7764d0735bf6.gz --- inc/fulltext.php | 153 ++++++++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 69 deletions(-) diff --git a/inc/fulltext.php b/inc/fulltext.php index 06834f5ae..afb15528e 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -278,81 +278,96 @@ function ft_pagesorter($a, $b){ * Creates a snippet extract * * @author Andreas Gohr <andi@splitbrain.org> + * @triggers FULLTEXT_SNIPPET_CREATE */ function ft_snippet($id,$highlight){ - $text = rawWiki($id); - $match = array(); - $snippets = array(); - $utf8_offset = $offset = $end = 0; - $len = utf8_strlen($text); - - // build a regexp from the phrases to highlight - $re1 = '('.join('|',array_map('preg_quote_cb',array_filter((array) $highlight))).')'; - $re2 = "$re1.{0,75}(?!\\1)$re1"; - $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1"; - - for ($cnt=4; $cnt--;) { - if (0) { - } else if (preg_match('/'.$re3.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) { - } else if (preg_match('/'.$re2.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) { - } else if (preg_match('/'.$re1.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) { - } else { - break; - } - - list($str,$idx) = $match[0]; - - // convert $idx (a byte offset) into a utf8 character offset - $utf8_idx = utf8_strlen(substr($text,0,$idx)); - $utf8_len = utf8_strlen($str); - - // establish context, 100 bytes surrounding the match string - // first look to see if we can go 100 either side, - // then drop to 50 adding any excess if the other side can't go to 50, - $pre = min($utf8_idx-$utf8_offset,100); - $post = min($len-$utf8_idx-$utf8_len,100); - - if ($pre>50 && $post>50) { - $pre = $post = 50; - } else if ($pre>50) { - $pre = min($pre,100-$post); - } else if ($post>50) { - $post = min($post, 100-$pre); - } else { - // both are less than 50, means the context is the whole string - // make it so and break out of this loop - there is no need for the - // complex snippet calculations - $snippets = array($text); - break; - } - - // establish context start and end points, try to append to previous - // context if possible - $start = $utf8_idx - $pre; - $append = ($start < $end) ? $end : false; // still the end of the previous context snippet - $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context + $text = rawWiki($id); + $evdata = array( + 'id' => $id, + 'text' => &$text, + 'highlight' => &$highlight, + 'snippet' => '', + ); + + $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE',$evdata); + if ($evt->advise_before()) { + $match = array(); + $snippets = array(); + $utf8_offset = $offset = $end = 0; + $len = utf8_strlen($text); + + // build a regexp from the phrases to highlight + $re1 = '('.join('|',array_map('preg_quote_cb',array_filter((array) $highlight))).')'; + $re2 = "$re1.{0,75}(?!\\1)$re1"; + $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1"; + + for ($cnt=4; $cnt--;) { + if (0) { + } else if (preg_match('/'.$re3.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) { + } else if (preg_match('/'.$re2.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) { + } else if (preg_match('/'.$re1.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) { + } else { + break; + } + + list($str,$idx) = $match[0]; + + // convert $idx (a byte offset) into a utf8 character offset + $utf8_idx = utf8_strlen(substr($text,0,$idx)); + $utf8_len = utf8_strlen($str); + + // establish context, 100 bytes surrounding the match string + // first look to see if we can go 100 either side, + // then drop to 50 adding any excess if the other side can't go to 50, + $pre = min($utf8_idx-$utf8_offset,100); + $post = min($len-$utf8_idx-$utf8_len,100); + + if ($pre>50 && $post>50) { + $pre = $post = 50; + } else if ($pre>50) { + $pre = min($pre,100-$post); + } else if ($post>50) { + $post = min($post, 100-$pre); + } else { + // both are less than 50, means the context is the whole string + // make it so and break out of this loop - there is no need for the + // complex snippet calculations + $snippets = array($text); + break; + } + + // establish context start and end points, try to append to previous + // context if possible + $start = $utf8_idx - $pre; + $append = ($start < $end) ? $end : false; // still the end of the previous context snippet + $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context + + if ($append) { + $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); + } else { + $snippets[] = utf8_substr($text,$start,$end-$start); + } + + // set $offset for next match attempt + // substract strlen to avoid splitting a potential search success, + // this is an approximation as the search pattern may match strings + // of varying length and it will fail if the context snippet + // boundary breaks a matching string longer than the current match + $utf8_offset = $utf8_idx + $post; + $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post)); + $offset = utf8_correctIdx($text,$offset); + } - if ($append) { - $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append); - } else { - $snippets[] = utf8_substr($text,$start,$end-$start); - } + $m = "\1"; + $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets); + $snippet = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets))); - // set $offset for next match attempt - // substract strlen to avoid splitting a potential search success, - // this is an approximation as the search pattern may match strings - // of varying length and it will fail if the context snippet - // boundary breaks a matching string longer than the current match - $utf8_offset = $utf8_idx + $post; - $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post)); - $offset = utf8_correctIdx($text,$offset); + $evdata['snippet'] = $snippet; } + $evt->advise_after(); + unset($evt); - $m = "\1"; - $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets); - $snippet = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets))); - - return $snippet; + return $evdata['snippet']; } /** -- GitLab