Skip to content
Snippets Groups Projects
Commit 6b06b652 authored by chris's avatar chris
Browse files

backlinks fixes (bugs #795 & #937)

- add deaccented and romanised page names to index word list
- remove stop words from tokens used in backlink search

darcs-hash:20061105195453-9b6ab-6c4989eb75782af60a3de3bddbc99a83de2b4c80.gz
parent 67cf9a09
No related branches found
No related tags found
No related merge requests found
......@@ -105,12 +105,14 @@ function ft_pageSearch($query,&$poswords){
*/
function ft_backlinks($id){
global $conf;
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
$stopwords = @file_exists($swfile) ? file($swfile) : array();
$result = array();
// quick lookup of the pagename
$page = noNS($id);
$sw = array(); // we don't use stopwords here
$matches = idx_lookup(idx_tokenizer($page,$sw)); // pagename may contain specials (_ or .)
$matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .)
$docs = array_keys(ft_resultCombine(array_values($matches)));
$docs = array_filter($docs,'isVisiblePage'); // discard hidden pages
if(!count($docs)) return $result;
......@@ -119,7 +121,7 @@ function ft_backlinks($id){
// check metadata for matching links
foreach($docs as $match){
// metadata relation reference links are already resolved
$links = p_get_metadata($match,"relation references");
$links = p_get_metadata($match,'relation references');
if (isset($links[$id])) $result[] = $match;
}
......
......@@ -48,6 +48,21 @@ function idx_getPageWords($page){
$tokens = explode(' ', $body);
$tokens = array_count_values($tokens); // count the frequency of each token
// ensure the deaccented or romanised page names of internal links are added to the token array
// (this is necessary for the backlink function -- there may be a better way!)
if ($conf['deaccent']) {
$links = p_get_metadata($page,'relation references');
$tmp = join(' ',array_keys($links)); // make a single string
$tmp = strtr($tmp, ':', ' '); // replace namespace separator with a space
$link_tokens = array_unique(explode(' ', $tmp)); // break into tokens
foreach ($link_tokens as $link_token) {
if (isset($tokens[$link_token])) continue;
$tokens[$link_token] = 1;
}
}
$words = array();
foreach ($tokens as $word => $count) {
// simple filter to restrict use of utf8_stripspecials
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment