Last active
November 8, 2016 04:37
-
-
Save zwjzxh520/7e62a850ab8a6fd40417e4be0f2a25d2 to your computer and use it in GitHub Desktop.
最小编辑距离算法比较文本异同 Levenshtein Distance。引用自https://code.google.com/archive/p/clamdemo/downloads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/*************************************************************************** | |
* | |
* Copyright (c) 2011 log4myself.com, Inc. All Rights Reserved | |
* | |
**************************************************************************/ | |
/** | |
* @file MinumDistance.php | |
* @author caojiandong([email protected]) | |
* @date 2011/11/24 16:03:59 | |
* @brief 计算最小编辑距离类,返回输出过程 | |
* | |
**/ | |
class MinumDistance { | |
/** | |
* 计算编辑距离的核心算法 | |
* | |
* return array( | |
* 'distance' => 最小编辑距离 | |
* 'progress' => 编辑过程 | |
* ) | |
**/ | |
const MAX = 100000; | |
private function execute($src,$tgt){ | |
$progress = array(); | |
$m = count($src); | |
$n = count($tgt); | |
$d[self::MAX][self::MAX] = array(); | |
$type[self::MAX][self::MAX] = array(); | |
$back[self::MAX][self::MAX][2] = array(); | |
$i = 0; | |
$j = 0; | |
$backi = 0; | |
$backj = 0; | |
$backtype = 0; | |
for($i = 0;$i <= $m;$i ++) | |
$d[$i][0] = $i; | |
for($j = 0;$j <= $n;$j ++) | |
$d[0][$j] = $j; | |
for($j = 1;$j <= $n;$j ++){ | |
for($i = 1;$i <= $m;$i ++){ | |
if($src[$i-1] == $tgt[$j-1]){ | |
$d[$i][$j] = $d[$i-1][$j-1]; | |
$back[$i][$j][0] = $i-1; | |
$back[$i][$j][1] = $j-1; | |
$type[$i][$j] = 0; | |
}else{ | |
$d[$i][$j] = $d[$i-1][$j]+1; | |
$back[$i][$j][0] = $i-1; | |
$back[$i][$j][1] = $j; | |
$type[$i][$j] = 2; | |
if($d[$i][$j]>$d[$i][$j-1]+1){ | |
$d[$i][$j] = $d[$i][$j-1]+1; | |
$back[$i][$j][0] = $i; | |
$back[$i][$j][1] = $j-1; | |
$type[$i][$j] = 1; | |
} | |
if($d[$i][$j]>$d[$i-1][$j-1]+1){ | |
$d[$i][$j] = $d[$i-1][$j-1]+1; | |
$back[$i][$j][0] = $i-1; | |
$back[$i][$j][1] = $j-1; | |
$type[$i][$j] = 3; | |
} | |
} | |
} | |
} | |
$backi = $m; | |
$backj = $n; | |
while($backi && $backj){ | |
$backtype = $type[$backi][$backj]; | |
switch($backtype){ | |
case 0: | |
$progress['copy'][] = array($backi-1,$backj-1); | |
break; | |
case 1: | |
$progress['del'][] = $backj-1; | |
break; | |
case 2: | |
$progress['insert'][] = $backi-1; | |
break; | |
case 3: | |
$progress['replace'][] = array($backi-1,$backj-1); | |
break; | |
} | |
$i = $back[$backi][$backj][0]; | |
$j = $back[$backi][$backj][1]; | |
$backi = $i; | |
$backj = $j; | |
} | |
return array( | |
'distance' => $d[$m][$n], | |
'progress' => $progress, | |
); | |
} | |
public function getMinumDistance($arrSrc,$arrTgt){ | |
//接受string参数,如果是string,则进行stringToarray的转化 | |
if(is_string($arrSrc) && is_string($arrTgt)){ | |
$arrSrc = $this->getArrayFromString($arrSrc); | |
$arrTgt = $this->getArrayFromString($arrTgt); | |
} | |
if((!is_string($arrSrc) || !is_string($arrTgt)) && | |
(!is_array($arrSrc) || !is_array($arrTgt))) | |
return false; | |
return $this->execute($arrSrc,$arrTgt); | |
} | |
public function __construct(){ | |
} | |
//将文本分字成数组 | |
private function getArrayFromString($str){ | |
$result = array(); | |
$strLen = mb_strlen($str,"UTF8"); | |
$i = 0; | |
for($i = 0; $i < $strLen ; $i ++){ | |
$result[] = mb_substr($str,$i,1); | |
} | |
return $result; | |
} | |
} | |
//显示不同之处的示例代码 | |
$src = 'hello world 打倒旧世界'; | |
$tgt = 'heelo wolrd!开创新世界'; | |
$md = new MinumDistance(); | |
$result = $md->getMinumDistance($src, $tgt); | |
$p = $result['progress']; | |
$index = 0; | |
$srcProgress = $tgtProgress = []; | |
foreach ($p['copy'] as $value) { | |
if ($value[0] == $value[1]) { | |
$srcProgress[] = $value[0]; | |
$tgtProgress[] = $value[1]; | |
} | |
} | |
echo hightlight($src, $srcProgress). '<br />'; | |
echo hightlight($tgt, $tgtProgress). '<br />'; | |
function hightlight($str, $progress) { | |
$srcArr = preg_split('//u', $str, null, PREG_SPLIT_NO_EMPTY); | |
$j = count($srcArr); | |
foreach ($progress as $i) { | |
if ($i < $j) { | |
for ($k=$i+1; $k < $j; $k++) { | |
$srcArr[$k] = '<span style="color:red;">'.$srcArr[$k].'</span>'; | |
} | |
} | |
$j = $i; | |
} | |
return implode('', $srcArr); | |
} | |
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment