From 6b06b65228c9fbd6e8e45658458b14a0e8c2cdfc Mon Sep 17 00:00:00 2001 From: chris <chris@jalakai.co.uk> Date: Sun, 5 Nov 2006 20:54:53 +0100 Subject: [PATCH] backlinks fixes (bugs #795 & #937) - add deaccented and romanised page names to index word list - remove stop words from tokens used in backlink search darcs-hash:20061105195453-9b6ab-6c4989eb75782af60a3de3bddbc99a83de2b4c80.gz --- inc/fulltext.php | 8 +++++--- inc/indexer.php | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/inc/fulltext.php b/inc/fulltext.php index 0d3acb609..448f72248 100644 --- a/inc/fulltext.php +++ b/inc/fulltext.php @@ -105,12 +105,14 @@ function ft_pageSearch($query,&$poswords){ */ function ft_backlinks($id){ global $conf; + $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; + $stopwords = @file_exists($swfile) ? file($swfile) : array(); + $result = array(); // quick lookup of the pagename $page = noNS($id); - $sw = array(); // we don't use stopwords here - $matches = idx_lookup(idx_tokenizer($page,$sw)); // pagename may contain specials (_ or .) + $matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .) 
$docs = array_keys(ft_resultCombine(array_values($matches))); $docs = array_filter($docs,'isVisiblePage'); // discard hidden pages if(!count($docs)) return $result; @@ -119,7 +121,7 @@ function ft_backlinks($id){ // check metadata for matching links foreach($docs as $match){ // metadata relation reference links are already resolved - $links = p_get_metadata($match,"relation references"); + $links = p_get_metadata($match,'relation references'); if (isset($links[$id])) $result[] = $match; } diff --git a/inc/indexer.php b/inc/indexer.php index a2b7a0637..e6550c2e4 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -48,6 +48,21 @@ function idx_getPageWords($page){ $tokens = explode(' ', $body); $tokens = array_count_values($tokens); // count the frequency of each token +// ensure the deaccented or romanised page names of internal links are added to the token array +// (this is necessary for the backlink function -- there may be a better way!) + if ($conf['deaccent']) { + $links = p_get_metadata($page,'relation references'); + + $tmp = join(' ',array_keys($links)); // make a single string + $tmp = strtr($tmp, ':', ' '); // replace namespace separator with a space + $link_tokens = array_unique(explode(' ', $tmp)); // break into tokens + + foreach ($link_tokens as $link_token) { + if (isset($tokens[$link_token])) continue; + $tokens[$link_token] = 1; + } + } + $words = array(); foreach ($tokens as $word => $count) { // simple filter to restrict use of utf8_stripspecials -- GitLab