diff --git a/inc/indexer.php b/inc/indexer.php index c5faa5756f541a2e606e2ed535625819404a2d63..7ca870526be6c7405db2178fdee7c573d6d7779b 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -37,19 +37,20 @@ function idx_getPageWords($page){ $words = array(); foreach ($tokens as $word => $count) { - $word = utf8_strtolower($word); // simple filter to restrict use of utf8_stripspecials - if (preg_match('/\W/', $word)) { + if (preg_match('/[^0-9A-Za-z]/u', $word)) { $arr = explode(' ', utf8_stripspecials($word,' ','._\-:')); $arr = array_count_values($arr); foreach ($arr as $w => $c) { if (!is_numeric($w) && strlen($w) < 3) continue; + $w = utf8_strtolower($w); $words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0); } } else { if (!is_numeric($w) && strlen($w) < 3) continue; + $word = strtolower($word); $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0); } } diff --git a/inc/lang/en/stopwords.txt b/inc/lang/en/stopwords.txt index 478fb33eff0c0cab2ed916b4c162d77bf491264f..bc6eb48aea08f241e220bc59d6d6851acfb8e381 100644 --- a/inc/lang/en/stopwords.txt +++ b/inc/lang/en/stopwords.txt @@ -12,6 +12,7 @@ their com for from +into how that the