Skip to content
Snippets Groups Projects
Commit bc54ab52 authored by chris
Browse files

indexer improvements & fix for underscores

darcs-hash:20050816032408-50fdc-6e41585c9b97d70a218877b8ad169df9117d9965.gz
parent a99d3236
No related branches found
No related tags found
No related merge requests found
......@@ -37,19 +37,20 @@ function idx_getPageWords($page){
$words = array();
foreach ($tokens as $word => $count) {
$word = utf8_strtolower($word);
// simple filter to restrict use of utf8_stripspecials
if (preg_match('/\W/', $word)) {
if (preg_match('/[^0-9A-Za-z]/u', $word)) {
$arr = explode(' ', utf8_stripspecials($word,' ','._\-:'));
$arr = array_count_values($arr);
foreach ($arr as $w => $c) {
if (!is_numeric($w) && strlen($w) < 3) continue;
$w = utf8_strtolower($w);
$words[$w] = $c + (isset($words[$w]) ? $words[$w] : 0);
}
} else {
if (!is_numeric($w) && strlen($w) < 3) continue;
$word = strtolower($word);
$words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
}
}
......
......@@ -12,6 +12,7 @@ their
com
for
from
into
how
that
the
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment