Level 1 Headline

    // arrive here with $words = array(word => frequency)

    // Map every non-stopword to its id (its position in $word_idx) and record
    // the word's frequency under that id. $stopwords and $word_idx are not
    // defined in this excerpt; both are searched as lists of "word\n" lines.
    $index = array(); //resulting index: word id => frequency
    foreach ($words as $word => $freq) {
        // strict search: loose comparison treats numeric-looking entries such
        // as "010\n" and "10\n" as equal on PHP 8 and would merge distinct words
        if (is_int(array_search("$word\n",$stopwords,true))) continue;
        $wid = array_search("$word\n",$word_idx,true);
        if(!is_int($wid)){
            // unknown word: append it, its id is the new last position
            $word_idx[] = "$word\n";
            $wid = count($word_idx)-1;
        }
        $index[$wid] = $freq;
    }

/**
 * Split a page into words
 *
 * Returns an array of word counts keyed by word id, where a word id is the
 * (zero-based) line position of the word in word.idx. Words that are not yet
 * listed in word.idx are appended to it and the file is written back.
 * Returns false if word.idx could not be opened for writing.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Christopher Smith <chris@jalakai.co.uk>
 *
 * @param  string $page  the page to index (handed to rawWiki() and p_get_metadata())
 * @return array|false   array(word id => frequency), false on write failure
 */
function idx_getPageWords($page){
    global $conf;

    // file() returns false when the index cannot be read (e.g. it does not
    // exist yet); continue with an empty index in that case
    $word_idx = file($conf['cachedir'].'/word.idx');
    if(!is_array($word_idx)) $word_idx = array();

    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $body   = rawWiki($page);
    $body   = strtr($body, "\r\n\t", '   ');  // treat all whitespace as a plain space
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there may be a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page,'relation references');

        // guard: only process the references when an array was actually returned
        if (is_array($links)) {
            $tmp = join(' ',array_keys($links));                // make a single string
            $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) continue;
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        // simple filter to restrict use of utf8_stripspecials
        if (preg_match('/[^0-9A-Za-z]/u', $word)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/('.IDX_ASIAN.')/u','\1 ',$word);
            if(!is_null($asia)) $word = $asia; //recover from regexp failure
            $arr = explode(' ', utf8_stripspecials($word,' ','._\-:\*'));
            $arr = array_count_values($arr);

            foreach ($arr as $w => $c) {
                // drop non-numeric fragments shorter than three bytes
                if (!is_numeric($w) && strlen($w) < 3) continue;
                $w = utf8_strtolower($w);
                // each fragment occurred $c times per token, the token $count times
                $words[$w] = $c * $count + (isset($words[$w]) ? $words[$w] : 0);
            }
        } else {
            // pure [0-9A-Za-z] token: plain strtolower is sufficient
            if (!is_numeric($word) && strlen($word) < 3) continue;
            $word = strtolower($word);
            $words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
        }
    }

    // arrive here with $words = array(word => frequency)

    // Build lookup tables keyed by the raw "word\n" line. Key lookups compare
    // the strings exactly (a loose array_search() would treat numeric-looking
    // lines such as "010\n" and "10\n" as equal on PHP 8) and avoid rescanning
    // the whole index for every word.
    $stop_lookup = array_flip($stopwords);
    $wid_lookup  = array();
    foreach ($word_idx as $pos => $line) {
        // keep the first position of a line, as array_search() would report it
        if (!isset($wid_lookup[$line])) $wid_lookup[$line] = $pos;
    }

    $index = array(); //resulting index
    foreach ($words as $word => $freq) {
        $line = "$word\n";
        if (isset($stop_lookup[$line])) continue;

        if (isset($wid_lookup[$line])) {
            $wid = $wid_lookup[$line];
        } else {
            // unknown word: append it, its id is the new last position
            $word_idx[] = $line;
            $wid = count($word_idx)-1;
            $wid_lookup[$line] = $wid;
        }
        $index[$wid] = $freq;
    }

    // save back word index
    $fh = fopen($conf['cachedir'].'/word.idx','w');
    if(!$fh){
        trigger_error("Failed to write word.idx", E_USER_ERROR);
        return false;
    }
    fwrite($fh,join('',$word_idx));
    fclose($fh);

    return $index;
}

Level 2 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 2 Headline

Level 2 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 2 Headline

Level 2 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 2 Headline

Level 2 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 3 Headline

Level 3 Headline

Level 2 Headline

Level 2 Headline

 
playground/test9.txt · Last modified: 2007/01/16 09:11 by chris
 
Recent changes RSS feed Donate Powered by PHP Valid XHTML 1.0 Valid CSS Driven by DokuWiki