// NOTE(review): this fragment sits outside any function visible here and is
// statement-for-statement the same as the index-building loop inside
// idx_getPageWords(). $words, $stopwords and $word_idx are not defined in the
// visible scope -- confirm whether this is the tail of a definition that starts
// above this point or a leftover that should be removed.
// arrive here with $words = array(word => frequency)
$index = array(); //resulting index
foreach ($words as $word => $freq) {
// skip words that appear as a line of the stopword list
if (is_int(array_search("$word\n",$stopwords))) continue;
// look up the position of the word in the word list
$wid = array_search("$word\n",$word_idx);
if(!is_int($wid)){
// unknown word: append it, its id is the new last position
$word_idx[] = "$word\n";
$wid = count($word_idx)-1;
}
// map word id => frequency
$index[$wid] = $freq;
}
/**
 * Split a page into words
 *
 * Reads the raw wiki text of the page, splits it into tokens, normalises
 * the tokens into words and maps every word that is not a stopword to its
 * id in the word index (cachedir/word.idx). Words that are not yet part of
 * the word index are appended to it and the index file is rewritten.
 *
 * Returns an array of word counts, false if an error occurred
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  id of the page, handed to rawWiki() and p_get_metadata()
 * @return array|false   array(word id => frequency) where the word id is the
 *                       zero based line position of the word in word.idx;
 *                       false if word.idx could not be opened for writing
 */
function idx_getPageWords($page){
    global $conf;

    // file() yields false when word.idx is missing or unreadable; start with
    // an empty word list in that case instead of treating false as an array
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $body = rawWiki($page);
    // strtr() ignores the surplus characters when both lists differ in
    // length, so one replacement space is needed per character; a single
    // space would only translate "\r" and leave "\n" and "\t" untouched
    $body = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there maybe a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        // only proceed when the metadata really holds a list of references
        if (is_array($links)) {
            $tmp = join(' ',array_keys($links));                // make a single string
            $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure

            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                // non numeric words shorter than 3 bytes are dropped
                // (strlen() counts bytes, not characters)
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                // a token seen $count times contributes each of its words $c * $count times
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            // plain ASCII alphanumeric token, no special handling needed
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        // the lines returned by file() keep their trailing newline
        $line = "$word\n";

        // strict search: a loose comparison treats numeric strings as numbers
        // (on PHP 8 even with the trailing newline), which would make words
        // like "007" and "7" share one stopword entry or one word id
        if (is_int(array_search($line,$stopwords,true))) continue;

        $wid = array_search($line,$word_idx,true);
        if(!is_int($wid)){
            // unknown word: append it, its id is the new last position
            $word_idx[] = $line;
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}