Last active
September 15, 2024 14:50
-
-
Save nviet/0c087ca9a2720cc97924963ae3b7c625 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Run this script in CLI mode, e.g: php parse_pdf.php full_path_to_input_file.txt
//
// Reads a fixed-width text file line by line and writes "<input>.csv" next to it.
// A new CSV row starts at every short line that begins with a dd/mm/yyyy date;
// the lines that follow are sliced at fixed byte offsets and appended to that
// row's columns until the next date line. Lines from one starting with
// "Postal address" up to the next date line are ignored.
// NOTE(review): presumably the input is a layout-preserving text dump of a PDF
// statement (going by the script name) - confirm the column offsets against it.

/**
 * Normalises every cell of a row: removes line breaks, collapses each run of
 * whitespace into a single space and trims both ends.
 *
 * Works on a copy (by value) so no reference into the caller's array is left
 * dangling after the loop.
 *
 * @param array $row Cell values keyed by column.
 * @return array The same keys with cleaned values.
 */
function clean_row(array $row)
{
    foreach ($row as $key => $cell)
    {
        $cell = str_replace(array("\r\n", "\n", "\r"), "", $cell);
        $row[$key] = trim(preg_replace('/\s+/', " ", $cell));
    }
    return $row;
}

/**
 * Cleans a row and appends it to the CSV output, aborting with a non-zero
 * exit status if the write fails.
 *
 * @param resource $handle Open, writable file handle.
 * @param array    $row    Cell values to write.
 * @return void
 */
function write_row($handle, array $row)
{
    if (fputcsv($handle, clean_row($row)) === false)
    {
        fwrite(STDERR, "unable to write to output file" . PHP_EOL);
        exit(1);
    }
}

$time_start = microtime(true);

// Fail early with a usage hint instead of an undefined-index notice.
if (!isset($argv[1]) || $argv[1] === "")
{
    fwrite(STDERR, "usage: php " . basename(__FILE__) . " full_path_to_input_file.txt" . PHP_EOL);
    exit(1);
}
$input_file = $argv[1];

// Validate the input before touching the output, so a bad path does not
// leave an empty .csv behind.
$handle_input = fopen($input_file, "r");
if (!$handle_input)
{
    fwrite(STDERR, "unable to open input file" . PHP_EOL);
    exit(1);
}
$handle_output = fopen($input_file . ".csv", "w");
if (!$handle_output)
{
    fclose($handle_input);
    fwrite(STDERR, "unable to open output file" . PHP_EOL);
    exit(1);
}

// Nothing is collected until the first date line has been seen.
$skip_line = true;
// The pending row starts out as the header; it is flushed when the first
// date line arrives (or at end of file if there is none).
$row = array("Date", "Doc No.", "Debit", "Credit", "Balance", "Details");

while (($line = fgets($handle_input)) !== false)
{
    $trimmed_line = ltrim($line);

    // A short line starting with dd/mm/yyyy opens a new record.
    if (strlen($trimmed_line) < 20 && preg_match("/^\d{2}\/\d{2}\/\d{4}/", $trimmed_line))
    {
        write_row($handle_output, $row);
        $row = array(
            "date"    => substr($trimmed_line, 0, 11),
            "docno"   => "",
            "debit"   => "",
            "credit"  => "",
            "balance" => "",
            "details" => "",
        );
        $skip_line = false;
        continue;
    }

    // Stop collecting at this marker until the next date line.
    if (strpos($trimmed_line, "Postal address") === 0)
    {
        $skip_line = true;
    }
    if ($skip_line)
    {
        continue;
    }

    // Fixed byte offsets into the untrimmed line; a record may span several
    // lines, so each slice is appended to what was collected so far.
    $row["docno"]   .= trim(substr($line, 0, 11));
    $row["debit"]   .= trim(substr($line, 40, 19));
    $row["credit"]  .= trim(substr($line, 59, 30));
    $row["balance"] .= trim(substr($line, 90, 19));
    // Only line breaks are stripped here so spacing between wrapped detail
    // lines survives until clean_row() collapses it.
    $row["details"] .= trim(substr($line, 110), "\n\r");
}

// Flush the last pending record (or the header when no date line was found).
write_row($handle_output, $row);

fclose($handle_input);
fclose($handle_output);

$time_end = microtime(true);
$execution_time = number_format($time_end - $time_start, 2);
echo "Execution time: " . $execution_time . "s" . PHP_EOL;
echo "Peak memory usage: " . number_format((memory_get_peak_usage(true)/1024/1024), 2) . "MB" . PHP_EOL;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment