Created
November 3, 2020 16:43
-
-
Save maximal/c9b29b4367c9b60beca5687702329a17 to your computer and use it in GitHub Desktop.
Excel writing example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Генератор текстовых диффов из XLSX-файлов | |
* с использованием редакционного расстояния Левенштейна | |
* | |
* @author MaximAL | |
* @since 2020-08-01 | |
* @date 2020-08-01 | |
* @time 4:51 | |
* | |
* @link https://maximals.ru | |
* @link https://sijeko.ru | |
*/ | |
namespace Maximal\TextDiff; | |
use Box\Spout\Common\Entity\Row; | |
use Box\Spout\Reader\Common\Creator\ReaderEntityFactory; | |
use Box\Spout\Reader\XLSX\Sheet; | |
use Box\Spout\Writer\Common\Creator\Style\StyleBuilder; | |
use Box\Spout\Writer\Common\Creator\WriterEntityFactory; | |
/** | |
* Class App | |
* @package Maximal\TextDiff | |
*/ | |
class App | |
{ | |
const DEFAULT_NUMBER = 2; | |
private $config; | |
private $argv; | |
public function __construct(array $argv, array $config = []) | |
{ | |
$this->argv = $argv; | |
$this->config = $config; | |
} | |
public function run() | |
{ | |
echo 'XLSX text diff top generator (c) MaximAL 2020', PHP_EOL; | |
$argc = count($this->argv); | |
if ($argc < 3 || $argc > 4) { | |
$this->help(); | |
return; | |
} | |
$timeStart = microtime(true); | |
// Валидация количества | |
$numberParam = $argc === 4 ? $this->argv[3] : ''; | |
if ($argc === 4 && !preg_match('/^[1-9]\d*$/', $numberParam)) { | |
echo 'Third parameter (got: ', $numberParam, ') should be a positive integer.', PHP_EOL; | |
return; | |
} | |
$number = $argc === 4 ? intval($numberParam) : self::DEFAULT_NUMBER; | |
// Валидация входного файла | |
$inputFile = $this->argv[1]; | |
if (!preg_match('/\.xlsx$/ui', $inputFile)) { | |
echo 'File must be in XLSX format only: ', $inputFile, PHP_EOL; | |
return; | |
} | |
if (!is_file($inputFile)) { | |
echo 'Input file cannot be read or does not exist: ', $inputFile, PHP_EOL; | |
return; | |
} | |
// Валидация выходного файла | |
$outputFile = $this->argv[2]; | |
if (is_file($outputFile)) { | |
echo 'Output file already exists: ', $outputFile, PHP_EOL; | |
return; | |
} | |
if (@file_put_contents($outputFile, 'test') === false) { | |
echo 'Could not open file for writing (read-only?): ', $outputFile, PHP_EOL; | |
return; | |
} | |
unlink($outputFile); | |
// Поехали! | |
$reader = ReaderEntityFactory::createXLSXReader(); | |
$reader->open($inputFile); | |
$sheetNames = []; | |
foreach ($reader->getSheetIterator() as $sheet) { | |
/** @var Sheet $sheet */ | |
$sheetNames []= $sheet->getName(); | |
} | |
if (count($sheetNames) < 2) { | |
echo 'XLSX file must contain at least 2 sheets: ', $inputFile, PHP_EOL; | |
return; | |
} | |
$array1 = []; | |
$array2 = []; | |
foreach ($reader->getSheetIterator() as $index => $sheet) { | |
/** @var Sheet $sheet */ | |
foreach ($sheet->getRowIterator() as $row) { | |
/** @var Row $row */ | |
if ($index === 1) { | |
$array1 []= $row->toArray(); | |
} else { | |
$array2 []= $row->toArray(); | |
} | |
} | |
} | |
$reader->close(); | |
array_shift($array1); | |
array_shift($array2); | |
$count1 = count($array1); | |
$count2 = count($array2); | |
echo 'Average calculations: ', $count1, ' × ', $count2, ' ≈ '; | |
echo self::averageNumber($count1 * $count2), PHP_EOL; | |
echo 'Average output rows: ', $count1, ' × ', $number, ' ≈ '; | |
echo self::averageNumber($count1 * $number), PHP_EOL; | |
$defaultStyle = (new StyleBuilder())->setShouldWrapText()->build(); | |
$writer = WriterEntityFactory::createXLSXWriter(); | |
$writer->openToFile($outputFile)->setDefaultRowStyle($defaultStyle); | |
$writer->addRow(WriterEntityFactory::createRowFromArray([ | |
'Массив 1', | |
null, | |
'Массив 2', | |
null, | |
'Близость', | |
])); | |
$writer->addRow(WriterEntityFactory::createRowFromArray([ | |
'ИД', | |
'Значение', | |
'ИД', | |
'Значение', | |
'Расстояние Дамерау—Левенштейна', | |
])); | |
$index = 0; | |
$rows = 0; | |
foreach ($array1 as $row1) { | |
$index++; | |
$percent = 100 * $index / $count1; | |
echo "\r", $index , '/', $count1, sprintf(' %.1f%%', $percent); | |
$id1 = isset($row1[0]) ? intval($row1[0]) : 0; | |
if ($id1 === 0) { | |
continue; | |
} | |
$text1 = isset($row1[1]) ? trim($row1[1]) : ''; | |
if ($text1 === '') { | |
continue; | |
} | |
$distance = []; | |
foreach ($array2 as $row2) { | |
$id2 = isset($row2[0]) ? intval($row2[0]) : 0; | |
if ($id2 === 0) { | |
continue; | |
} | |
$text2 = isset($row2[1]) ? trim($row2[1]) : ''; | |
if ($text2 === '') { | |
continue; | |
} | |
$distance []= [ | |
'id' => $id2, | |
'text' => $text2, | |
'dist' => levenshtein($text1, $text2), | |
]; | |
} | |
usort($distance, [self::class, 'compareDistances']); | |
$top = array_slice($distance, 0, $number); | |
foreach ($top as $topIndex => $item) { | |
$writer->addRow(WriterEntityFactory::createRowFromArray([ | |
$id1, | |
$text1, | |
$item['id'], | |
$item['text'], | |
$item['dist'], | |
])); | |
$rows++; | |
} | |
unset($distance); | |
unset($top); | |
} | |
echo PHP_EOL, 'Writing file. Please wait...'; | |
$writer->close(); | |
$timeDiff = microtime(true) - $timeStart; | |
echo "\r", 'Done with ', $rows, ' rows in ', round($timeDiff), ' seconds.', PHP_EOL; | |
} | |
private function help() | |
{ | |
echo 'Usage: php ', $this->argv[0], ' {input XLSX file} '; | |
echo '{output XLSX file} [n, default ', self::DEFAULT_NUMBER, ']', PHP_EOL; | |
} | |
private static function compareDistances($a, $b): int | |
{ | |
if ($a['dist'] === $b['dist']) { | |
return 0; | |
} | |
return $a['dist'] < $b['dist'] ? -1 : +1; | |
} | |
private static function averageNumber($number): string | |
{ | |
if ($number > 999999999) { | |
return round($number / 1000000000, 1) . 'G'; | |
} | |
if ($number > 999999) { | |
return round($number / 1000000, 1) . 'M'; | |
} | |
if ($number > 999) { | |
return round($number / 1000, 1) . 'k'; | |
} | |
return $number; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment