k47.cz    — každý den dokud se vám to nezačne líbit
foto Praha výběr povídky kultura | twitter FB


Spellcheck

10. 9. 2012 — k47
<?php

// PHP implementation of http://norvig.com/spell-correct.html in 49 lines of code
// based on http://soundofemotion.com/spellcorrect.txt which is much longer

/**
 * Splits a text into lowercase word tokens.
 *
 * The text is lowercased first so that capitalised words ("The") count as the
 * same token as their lowercase form instead of being cut at the capital
 * letter. Every run of characters other than a-z acts as a separator, and
 * empty tokens are discarded.
 *
 * @param string $text raw input text
 * @return array list of tokens consisting only of the letters a-z
 */
function words($text) {
  // -1 means "no limit"; passing null for the limit is deprecated since PHP 8.1
  return preg_split("~[^a-z]+~", strtolower($text), -1, PREG_SPLIT_NO_EMPTY);
}

/**
 * Builds the frequency model: maps each distinct token to the number of
 * times it occurs in $features.
 *
 * @param array $features list of tokens
 * @return array token => occurrence count
 */
function train($features) {
  $model = array_count_values($features);

  return $model;
}

/**
 * Generates every string that differs from $word by a single edit:
 * deleting one character, swapping two adjacent characters, replacing one
 * character with a letter a-z, or inserting a letter a-z at any position.
 *
 * Duplicates are removed with array_unique(), so the returned array keeps
 * the keys of the first occurrences and is not necessarily a 0..n-1 list.
 *
 * @param string $word word to mutate
 * @return array distinct strings one edit away from $word
 */
function edits1($word) {
  $letters = str_split("abcdefghijklmnopqrstuvwxyz");
  $length  = strlen($word);
  $results = array();

  // drop the character at each position
  for ($i = 0; $i < $length; $i++) {
    $results[] = substr_replace($word, "", $i, 1);
  }

  // swap each pair of neighbouring characters
  for ($i = 0; $i + 1 < $length; $i++) {
    $results[] = substr_replace($word, $word[$i + 1] . $word[$i], $i, 2);
  }

  // overwrite the character at each position with every letter
  foreach ($letters as $letter) {
    for ($i = 0; $i < $length; $i++) {
      $results[] = substr_replace($word, $letter, $i, 1);
    }
  }

  // insert every letter at each position, including after the last character
  foreach ($letters as $letter) {
    for ($i = 0; $i <= $length; $i++) {
      $results[] = substr_replace($word, $letter, $i, 0);
    }
  }

  return array_unique($results);
}

/**
 * Collects the strings two single edits away from $word that appear as
 * keys in $nwords.
 *
 * Duplicates are removed with array_unique(), so the keys of the returned
 * array are not necessarily consecutive.
 *
 * @param string $word   word to mutate
 * @param array  $nwords frequency model (token => count)
 * @return array distinct known words reachable by two edits
 */
function known_edits2($word, $nwords) {
  $found = array();

  foreach (edits1($word) as $firstEdit) {
    foreach (edits1($firstEdit) as $secondEdit) {
      // skip anything the model has never seen
      if (!isset($nwords[$secondEdit])) {
        continue;
      }
      $found[] = $secondEdit;
    }
  }

  return array_unique($found);
}

/**
 * Filters $words down to those that exist as keys in $nwords.
 *
 * The list is flipped so the words become keys, intersected by key with the
 * model, and flipped back; surviving words therefore keep the index they
 * had in $words.
 *
 * @param array $words  candidate words
 * @param array $nwords frequency model (token => count)
 * @return array the words from $words that are keys of $nwords
 */
function known($words, $nwords) {
  $indexByWord = array_flip($words);
  $recognised  = array_intersect_key($indexByWord, $nwords);

  return array_flip($recognised);
}

/**
 * Picks the set of plausible corrections for $word, trying the closest
 * matches first:
 *   1. the word itself, if the model knows it,
 *   2. known words one edit away,
 *   3. known words two edits away,
 *   4. otherwise the word itself, unchanged.
 *
 * @param string $word   word to correct
 * @param array  $nwords frequency model (token => count)
 * @return array non-empty array of candidate words
 */
function candidates($word, $nwords) {
  $unchanged = array($word);

  if (known($unchanged, $nwords)) {
    return $unchanged;
  }

  $oneEditAway = known(edits1($word), $nwords);
  if ($oneEditAway) {
    return $oneEditAway;
  }

  $twoEditsAway = known_edits2($word, $nwords);
  if ($twoEditsAway) {
    return $twoEditsAway;
  }

  return $unchanged;
}

/**
 * Returns the most probable correction of $word.
 *
 * Each candidate produced by candidates() is weighted by its count in
 * $nwords (1 when the model does not contain it) and the candidate with the
 * highest weight is returned. On a tie the candidate that comes first wins,
 * so the result does not depend on the sort implementation.
 *
 * A single pass replaces the previous by-reference foreach + arsort(): that
 * loop left a dangling reference behind, reused the $word parameter as its
 * loop variable, and sorted the whole list only to read the top entry.
 *
 * @param string $word   word to correct
 * @param array  $nwords frequency model (token => count)
 * @return string the best candidate, or $word itself when nothing better is known
 */
function correct($word, $nwords) {
  $best = $word;
  $bestWeight = null;

  foreach (candidates($word, $nwords) as $candidate) {
    $weight = isset($nwords[$candidate]) ? $nwords[$candidate] : 1;

    // strict ">" keeps the earliest candidate when weights are equal
    if ($bestWeight === null || $weight > $bestWeight) {
      $best = $candidate;
      $bestWeight = $weight;
    }
  }

  return $best;
}

// Build the frequency model from the training corpus "big.txt", which is
// resolved relative to the current working directory.
$corpus = file_get_contents("big.txt");
if ($corpus === false) {
  // Without the corpus the model would be empty and every word would be
  // returned uncorrected, so fail loudly instead.
  throw new RuntimeException('Unable to read training corpus "big.txt"');
}
$nwords = train(words($corpus));

// Demo: print the suggested correction for a misspelled word.
echo correct("thay", $nwords), "\n";

https://gist.github.com/3688772


vstoupit do diskuze    sdílet na facebooku, twitteru, google+

příbuzné články:
Conway's game of life
Content-aware image cropping with Scala
Sleeping patterns
Atrox\Matcher
PHP 5.4
Detekce duplicitních souborů 📷

sem odkazují:
Útok /prog/

píše k47 & hosté, ascii@k47.cz