From 6b06b65228c9fbd6e8e45658458b14a0e8c2cdfc Mon Sep 17 00:00:00 2001
From: chris <chris@jalakai.co.uk>
Date: Sun, 5 Nov 2006 20:54:53 +0100
Subject: [PATCH] backlinks fixes (bugs #795 & #937)

- add deaccented and romanised page names to index word list
- remove stop words from tokens used in backlink search

darcs-hash:20061105195453-9b6ab-6c4989eb75782af60a3de3bddbc99a83de2b4c80.gz
---
 inc/fulltext.php |  8 +++++---
 inc/indexer.php  | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/inc/fulltext.php b/inc/fulltext.php
index 0d3acb609..448f72248 100644
--- a/inc/fulltext.php
+++ b/inc/fulltext.php
@@ -105,12 +105,14 @@ function ft_pageSearch($query,&$poswords){
  */
 function ft_backlinks($id){
     global $conf;
+    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
+    $stopwords = @file_exists($swfile) ? file($swfile) : array();
+
     $result = array();
 
     // quick lookup of the pagename
     $page    = noNS($id);
-    $sw      = array(); // we don't use stopwords here
-    $matches = idx_lookup(idx_tokenizer($page,$sw));  // pagename may contain specials (_ or .)
+    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
     $docs    = array_keys(ft_resultCombine(array_values($matches)));
     $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
     if(!count($docs)) return $result;
@@ -119,7 +121,7 @@ function ft_backlinks($id){
     // check metadata for matching links
     foreach($docs as $match){
         // metadata relation reference links are already resolved
-        $links = p_get_metadata($match,"relation references");
+        $links = p_get_metadata($match,'relation references');
         if (isset($links[$id])) $result[] = $match;
     }
 
diff --git a/inc/indexer.php b/inc/indexer.php
index a2b7a0637..e6550c2e4 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -48,6 +48,21 @@ function idx_getPageWords($page){
     $tokens = explode(' ', $body);
     $tokens = array_count_values($tokens);   // count the frequency of each token
 
+// ensure the deaccented or romanised page names of internal links are added to the token array
+// (this is necessary for the backlink function -- there may be a better way!)
+    if ($conf['deaccent']) {
+      $links = p_get_metadata($page,'relation references');
+
+      $tmp = join(' ',array_keys($links));                // make a single string
+      $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
+      $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens
+
+      foreach ($link_tokens as $link_token) {
+        if (isset($tokens[$link_token])) continue;
+        $tokens[$link_token] = 1;
+      }
+    }
+
     $words = array();
     foreach ($tokens as $word => $count) {
         // simple filter to restrict use of utf8_stripspecials
-- 
GitLab