Created
July 15, 2020 08:02
-
-
Save bisqwit/f796ea84b91ed44d353828446ca47bdb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* Word-wrapping code from Simon’s Quest Multilingual Retranslation project
 * Copyright © 2020 Joel Yliluoma — https://iki.fi/bisqwit/cv2fin/
 */
/**
 * One node of the word-wrap search: a partially laid-out text together with
 * the statistics that Score() turns into a priority.
 *
 * Every method receives the shared search description $info, an array with:
 *   [0] list of tokens, each array(type, text), where type is
 *       1 = word, 2 = newline, 3 = spaces, 4 = punctuation, 5 = full stop
 *   [1] the newline string emitted when a line is broken
 *   [2] maximum line width
 *   [3] maximum number of lines
 * Lengths are measured with strlen(), i.e. in bytes.
 */
class WordWrapState
{
    // Output state: cursor column, cursor line, text produced so far
    public $x = 0;
    public $y = 0;
    public $outcome = '';
    // Input state: index of the next token to consume
    public $position = 0;
    // Statistics
    public $num_blank_lines = 0;
    public $num_lines_beginning_with_punct = 0;
    public $maximum_line_length = 0; // Updated at nl and at end
    public $last_line_with_content = 0;
    public $full_lines_with_non_punctuation = 0;
    public $num_lines_ending_with_punct = 0; // Good thing
    public $num_lines_ending_with_punct_and_word = 0; // Bad thing
    public $num_fullstop_paragraphs = 0;
    // True once Score() has folded the final (unterminated) line into the
    // statistics, so that repeated Score() calls do not count it again.
    private $stats_finalized = false;

    /**
     * @param mixed $ref Optional state (or key => value list) whose fields are copied.
     * @param int   $pos Token index the new state starts at.
     */
    function __construct($ref = null, $pos = 0)
    {
        if ($ref !== null) {
            foreach ($ref as $k => $v) {
                $this->$k = $v;
            }
        }
        $this->position = $pos;
        // A copy is a new state; its own final line has not been accounted for.
        $this->stats_finalized = false;
    }

    /**
     * Returns a bitmask of layout problems:
     *   1 = width limit exceeded, 2 = height limit exceeded,
     *   4 = some full-width line ends in a word,
     *   8 = some line begins with punctuation.
     */
    function Errors(&$info)
    {
        $result = 0;
        $width  = $this->maximum_line_length;
        $height = $this->last_line_with_content + 1;
        if ($width > $info[2]) {
            $result |= 1;
        }
        if ($height > $info[3]) {
            $result |= 2;
        }
        if ($this->full_lines_with_non_punctuation) {
            $result |= 4;
        }
        if ($this->num_lines_beginning_with_punct) {
            $result |= 8;
        }
        return $result;
    }

    /**
     * Returns the priority of this state; higher is better, every flaw
     * subtracts a penalty.
     *
     * On a goal state the first call also records the statistics of the last
     * line (which has no newline after it) into this object. Later calls
     * reuse those statistics, so the returned score is stable.
     */
    function Score(&$info)
    {
        $score = 0;
        if ($this->IsGoal($info) && !$this->stats_finalized) {
            $this->UpdateStatsAtNl($info, $this->position - 1);
            $this->stats_finalized = true;
        }

        $errors = $this->Errors($info);
        if ($errors & 3) {
            // Width or height constraint exceeded
            $score -= 50000;
        }
        // Make sure won't generate redundant blank lines
        $score -= 4000 * ($this->num_blank_lines - $this->num_fullstop_paragraphs);
        if ($errors & 4) {
            $score -= 2500 * $this->full_lines_with_non_punctuation;
        }
        if ($errors & 8) {
            $score -= 800 * $this->num_lines_beginning_with_punct;
        }
        // Consuming tokens costs; lines that end in punctuation earn it back.
        $score -= 100 * ($this->position - $this->num_lines_ending_with_punct);
        $score -= 90 * $this->num_lines_ending_with_punct_and_word;

        // Variation: mean absolute length difference between consecutive
        // non-empty lines, relative to the longest line.
        $width     = $this->maximum_line_length;
        $pairs     = 0;
        $deviation = 0;
        $prev      = 0;
        foreach (explode($info[1], $this->outcome) as $line) {
            $l = strlen($line);
            if ($l) {
                if ($prev) {
                    ++$pairs;
                    $deviation += abs($prev - $l);
                }
                $prev = $l;
            }
        }
        $variation = ($width && $pairs) ? ($deviation / $pairs) / $width : 0;

        // Height penalty; paragraph breaks are free while the text still fits.
        $h = $this->last_line_with_content + 1;
        if ($h < $info[3]) {
            $h -= $this->num_fullstop_paragraphs;
        }
        $score -= 18 * max($h, 0);
        $score -= 25 * $variation;
        return $score;
    }

    /** True when every token has been consumed. */
    function IsGoal(&$info)
    {
        return $this->position >= count($info[0]);
    }

    /**
     * Consumes the token at the current position and passes every possible
     * successor state to $callback, one call per successor.
     */
    function Iterate(&$info, $callback)
    {
        $p = $this->position;
        if ($p >= count($info[0])) {
            return; // Goal state: nothing left to expand
        }
        $w    = $info[0][$p][1];
        $len  = strlen($w);
        $prev = ($p > 0) ? $info[0][$p - 1] : null;
        $next = ($p + 1 < count($info[0])) ? $info[0][$p + 1] : null;

        switch ($info[0][$p][0]) {
            // word
            case 1:
                // Option 1: Append word
                $n = $this->Successor($p);
                $n->x += $len;
                $n->outcome .= $w;
                $callback($n);
                // Option 2: Wordwrap, but only if previous token was punctuation.
                // This is used to wrap "aaa-bbb" into "aaa-\nbbb" when no space is present
                if ($this->x > 0 && $prev !== null && $prev[0] == 4) {
                    $n = $this->WrappedSuccessor($info, $p, 1);
                    $n->x += $len;
                    $n->outcome .= $w;
                    $callback($n);
                }
                break;

            // newline
            case 2:
                // Option 1: Generate newline.
                $n = $this->Successor($p);
                $n->UpdateStatsAtNl($info, $p - 1);
                if ($this->x == 0) {
                    ++$n->num_blank_lines;
                }
                $n->num_blank_lines += $len - 1;
                $n->x = 0;
                $n->y += $len;
                $n->outcome .= $w;
                $callback($n);
                // Option 2: If the previous token was a full stop, generate two newlines.
                if ($prev !== null && $prev[0] == 5) {
                    $n = $this->Successor($p);
                    $n->UpdateStatsAtNl($info, $p - 1);
                    $n->num_blank_lines += $len * 2 - 1;
                    $n->num_fullstop_paragraphs += $len * 2 - 1;
                    $n->x = 0;
                    $n->y += $len * 2;
                    $n->outcome .= $w . $w;
                    $callback($n);
                }
                break;

            // space
            case 3:
                // Option 1: Append spaces, but only if next element
                // is not newline, and if we are not in the beginning of a line.
                // Otherwise the spaces are consumed without producing output.
                $n = $this->Successor($p);
                if ($this->x > 0 && $next !== null && $next[0] != 2) {
                    $n->x += $len;
                    $n->outcome .= $w;
                }
                $callback($n);
                // Option 2: Replace spaces with a newline, but only if either
                // - (wishful) current line contains more than one word
                // - (mandatory) next line is word and line length would be exceeded otherwise
                $has_more_than_prev = ($prev !== null && $this->x > strlen($prev[1]));
                $next_would_overflow = ($next !== null && $next[0] != 2
                    && $this->x + $len + strlen($next[1]) >= $info[2]);
                if ($this->x > 0 && ($has_more_than_prev || $next_would_overflow)) {
                    $callback($this->WrappedSuccessor($info, $p, 1));
                    // Option 3: Replace space with two newlines, if previous token was a full stop
                    if ($prev !== null && $prev[0] == 5) {
                        $n = $this->WrappedSuccessor($info, $p, 2);
                        ++$n->num_blank_lines;
                        ++$n->num_fullstop_paragraphs;
                        $callback($n);
                    }
                }
                break;

            // punct
            case 4:
            case 5:
                // Option 1: Append punctuation
                $n = $this->Successor($p);
                if ($n->x == 0) {
                    ++$n->num_lines_beginning_with_punct;
                }
                $n->x += $len;
                $n->outcome .= $w;
                $callback($n);
                // Option 2: Wordwrap, but only if the line length would exceed limits otherwise.
                // NOTE(review): this branch starts a line with punctuation but does not
                // count it in num_lines_beginning_with_punct -- confirm that is intended.
                if ($this->x > 0 && $this->x + $len > $info[2]) {
                    $n = $this->WrappedSuccessor($info, $p, 1);
                    $n->x += $len;
                    $n->outcome .= $w;
                    $callback($n);
                }
                break;
        }
    }

    /**
     * Records statistics for the line that is about to end.
     *
     * @param array $info Search description (see class comment).
     * @param int   $p    Index of the last token placed on the line, or a
     *                    negative number when there is none.
     */
    function UpdateStatsAtNl(&$info, $p)
    {
        $this->maximum_line_length = max($this->maximum_line_length, $this->x);
        if (!($this->x > 0)) {
            return; // Empty line: nothing else to record
        }
        $this->last_line_with_content = max($this->last_line_with_content, $this->y);
        if ($p < 0) {
            return; // No token to inspect; every remaining rule looks at token $p
        }

        $type     = $info[0][$p][0];
        $l        = strlen($info[0][$p][1]);
        $is_word  = ($type == 1);
        $is_punct = ($type == 4 || $type == 5);

        // At full width, and the last item added was a word?
        if ($this->x == $info[2] && $is_word) {
            ++$this->full_lines_with_non_punctuation;
        }
        // Add bonus for a line that ends in punctuation,
        // unless the punctuation is at begin of line or the line contains just one word
        if ($is_punct) {
            if ($p >= 1
                && $info[0][$p - 1][0] == 1
                && $this->x > strlen($info[0][$p - 1][1]) + $l
            ) {
                ++$this->num_lines_ending_with_punct;
            } else {
                $this->num_lines_ending_with_punct += 0.5;
            }
        }
        // If a line ends with punctuation, space, and a word,
        // add penalty that is inversely proportional to the word's length
        // i.e. shorter words are more penalized
        if ($is_word
            && $p >= 2
            && ($info[0][$p - 2][0] == 5 || $info[0][$p - 2][0] == 4)
            && $info[0][$p - 1][0] == 3
            && $this->x >= strlen($info[0][$p - 2][1]) + strlen($info[0][$p - 1][1]) + $l
        ) {
            $this->num_lines_ending_with_punct_and_word += 1 / $l;
        }
        // If the line ends in a short word, add some penalty
        if ($is_word) {
            $this->num_lines_ending_with_punct_and_word += 0.03 / ($l * $l);
        }
    }

    /** Copy of this state positioned just past token $p. */
    private function Successor($p)
    {
        return new WordWrapState($this, $p + 1);
    }

    /**
     * Copy of this state positioned just past token $p, with the current line
     * closed (statistics recorded for it) and $newlines line breaks emitted.
     */
    private function WrappedSuccessor(&$info, $p, $newlines)
    {
        $n = $this->Successor($p);
        $n->UpdateStatsAtNl($info, $p - 1);
        $n->x = 0;
        $n->y += $newlines;
        $n->outcome .= str_repeat($info[1], $newlines);
        return $n;
    }
}
/**
 * Best-first search over word-wrap states.
 *
 * The state with the highest Score() is expanded first (SplPriorityQueue
 * extracts the highest priority first); the first goal state extracted wins.
 * The initial state is queued with priority 0 without being scored.
 *
 * @param object $firststate Start state; must provide IsGoal(), Iterate(),
 *                           Score(), Errors() and an `outcome` property.
 * @param array  $info       Search description handed to every state method.
 * @return array array(wrapped text, error bitmask from Errors())
 * @throws RuntimeException when the queue runs dry before any goal state is
 *                          reached (i.e. some state produced no successors).
 */
function Dijkstra($firststate, &$info)
{
    $queue = new SplPriorityQueue();
    $queue->insert($firststate, 0);

    // Every successor is scored once, at the moment it is queued.
    $enqueue = function ($newstate) use ($queue, &$info) {
        $queue->insert($newstate, $newstate->Score($info));
    };

    while ($queue->valid()) {
        $state = $queue->extract();
        if ($state->IsGoal($info)) {
            return array($state->outcome, $state->Errors($info));
        }
        $state->Iterate($info, $enqueue);
    }

    throw new RuntimeException('Word-wrap search ended without reaching a goal state');
}
/**
 * Tokenizes $subject and searches for the best word-wrapped layout.
 *
 * All character sets are treated as sets of single bytes.
 *
 * @param string $subject  Text to wrap.
 * @param string $punct    Punctuation characters (token type 4); may be empty.
 * @param string $fullstop Sentence-ending characters (token type 5); may be empty.
 * @param string $newline  Newline sequence; must not be empty.
 * @param string $indent   Whitespace characters (token type 3); may be empty.
 * @param int    $width    Maximum line width.
 * @param int    $height   Maximum number of lines.
 * @return array array(wrapped text, error bitmask) as returned by Dijkstra()
 * @throws InvalidArgumentException when $newline is empty.
 * @throws RuntimeException when the tokenizer pattern fails to run.
 */
function CV2WordWrap($subject, $punct,$fullstop,$newline,$indent, $width,$height)
{
    if (strlen($newline) == 0) {
        throw new InvalidArgumentException('CV2WordWrap: $newline must not be empty');
    }

    // Spell every byte as \xHH so that no character is special to PCRE.
    $hex = function ($chars) {
        $escaped = '';
        for ($a = 0; $a < strlen($chars); ++$a) {
            $escaped .= sprintf('\\x%02X', ord($chars[$a]));
        }
        return $escaped;
    };
    // "One or more characters from the set"; an empty set must never match.
    // (A literal "[]+" would not be an empty class in PCRE: the "]" is taken
    // as a member and the class swallows the rest of the pattern.)
    $run_of = function ($set) {
        return ($set === '') ? '(?!)' : "[{$set}]+";
    };

    $p = $hex($punct);
    $n = $hex($newline);
    $i = $hex($indent);
    $f = $hex($fullstop);

    // Group numbers equal token types:
    // 1 = word, 2 = newline, 3 = spaces, 4 = punctuation, 5 = full stop
    $pattern = "/([^{$p}{$i}{$n}{$f}]+)|({$n})|(" . $run_of($i) . ")|(" . $run_of($p) . ")|(" . $run_of($f) . ")/";
    if (preg_match_all($pattern, $subject, $mat) === false) {
        throw new RuntimeException('CV2WordWrap: tokenizer failed, PCRE error ' . preg_last_error());
    }

    $tokens = array();
    foreach ($mat[0] as $k => $v) {
        // Exactly one group took part in each match; the others are ''.
        for ($type = 1; $type <= 5; ++$type) {
            if (strlen($mat[$type][$k])) {
                $tokens[] = array($type, $v);
                break;
            }
        }
    }

    // Remove trailing spaces, if any
    for ($last = count($tokens) - 1; $last >= 0 && $tokens[$last][0] == 3; --$last) {
        array_pop($tokens);
    }

    $state = new WordWrapState;
    $info = array($tokens, $newline, $width, $height, "{$p}{$f}");
    return Dijkstra($state, $info);
}
?><?php | |
////////////////////////////////////////////// | |
// Test code: | |
/**
 * Makes line breaks visible for printing: every "\n" in $s becomes the
 * literal marker [nl].
 */
function TranslateDialogTranslated($s)
{
    $search  = "\n";
    $replace = '[nl]';
    return str_replace($search, $replace, $s);
}
//require 'inc/wordwrap.php'; | |
// Demo: wrap one dialogue into a 23x6 box and print the outcome.
// The sample is written with [nl] markers, which are expanded to real
// line breaks before wrapping.
$subject = str_replace(
    '[nl]',
    "\n",
    "[nl]If you plan to trek through a swamp, eat laurels. It neutralizes the poison."
);

$punct   = "-,;";   // punctuation
$punct2  = ".:!?";  // passed as the full-stop set
$newline = "\n";
$indent  = " ";

$result = CV2WordWrap($subject, $punct, $punct2, $newline, $indent, 23, 6);

// $result[0] is the wrapped text, $result[1] the error bitmask.
echo sprintf("Errors (bitmask): %d\n", $result[1]);
echo TranslateDialogTranslated($result[0]);
echo "\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment