From bc54ab520f404e26a95af051e9082aa8fad07d98 Mon Sep 17 00:00:00 2001
From: chris <chris@teacherscpd.co.uk>
Date: Tue, 16 Aug 2005 05:24:08 +0200
Subject: [PATCH] indexer improvements & fix for underscores

darcs-hash:20050816032408-50fdc-6e41585c9b97d70a218877b8ad169df9117d9965.gz
---
 inc/indexer.php           | 5 +++--
 inc/lang/en/stopwords.txt | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/inc/indexer.php b/inc/indexer.php
index c5faa5756..7ca870526 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -37,19 +37,20 @@ function idx_getPageWords($page){
     
     $words = array();
     foreach ($tokens as $word => $count) {
-        $word = utf8_strtolower($word);
 
         // simple filter to restrict use of utf8_stripspecials 
-        if (preg_match('/\W/', $word)) {
+        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
             $arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
             $arr = array_count_values($arr);
             
             foreach ($arr as $w => $c) {
                 if (!is_numeric($w) && strlen($w) < 3) continue;
+    		    $w = utf8_strtolower($w);
                 $words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
             }
         } else {
-            if (!is_numeric($w) && strlen($w) < 3) continue;
+            if (!is_numeric($word) && strlen($word) < 3) continue; // $w belongs to the other branch's inner loop
+            $word = strtolower($word); // branch only sees [0-9A-Za-z], so byte-wise lowercasing is enough
             $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
         }
     }
diff --git a/inc/lang/en/stopwords.txt b/inc/lang/en/stopwords.txt
index 478fb33ef..bc6eb48ae 100644
--- a/inc/lang/en/stopwords.txt
+++ b/inc/lang/en/stopwords.txt
@@ -12,6 +12,7 @@ their
 com
 for
 from
+into
 how
 that
 the
-- 
GitLab