Skip to content

Instantly share code, notes, and snippets.

@nviet
Last active September 15, 2024 14:50
Show Gist options
  • Save nviet/0c087ca9a2720cc97924963ae3b7c625 to your computer and use it in GitHub Desktop.
Save nviet/0c087ca9a2720cc97924963ae3b7c625 to your computer and use it in GitHub Desktop.
<?php
// Run this script in CLI mode, e.g: php parse_pdf.php full_path_to_input_file.txt
//
// Reads a fixed-width text file line by line and writes "<input file>.csv" next to it,
// one CSV row per record. A record starts at a short line that begins with a dd/mm/yyyy
// date; every following line is sliced at fixed byte offsets and appended to the cells of
// that record until the next date line. Lines from one starting with "Postal address" up
// to the next date line are ignored, as is everything before the first date line.
// NOTE(review): the input is presumably a layout-preserving text dump of a PDF statement;
// the column offsets below depend on that layout -- confirm against a sample file.

$time_start = microtime(true);

// Report a fatal problem on stderr and stop with a non-zero status. exit() with a string
// argument prints to stdout and exits with status 0, which hides failures from callers.
$fail = function ($message) {
    file_put_contents("php://stderr", $message . PHP_EOL);
    exit(1);
};

if (!isset($argv[1]) || $argv[1] === "") {
    $fail("Usage: php " . basename(__FILE__) . " full_path_to_input_file.txt");
}
$input_file = $argv[1];

// Open the input first so that a bad path does not leave an empty .csv file behind.
$handle_input = fopen($input_file, "r");
if ($handle_input === false) {
    $fail("Unable to open input file: " . $input_file);
}

$output_file = $input_file . ".csv";
$handle_output = fopen($output_file, "w");
if ($handle_output === false) {
    fclose($handle_input);
    $fail("Unable to open output file: " . $output_file);
}

// Clean every cell and append the row to the CSV. Line breaks are removed outright (no
// space is inserted), then each run of whitespace is collapsed to a single space and the
// cell is trimmed. array_map works on copies, so no by-reference loop variable is left
// dangling after the clean-up.
$write_row = function (array $row) use ($handle_output, $output_file, $fail) {
    $cells = array_map(function ($cell) {
        $cell = str_replace(["\r\n", "\n", "\r"], "", $cell);
        return trim(preg_replace('/\s+/', " ", $cell));
    }, $row);
    if (fputcsv($handle_output, $cells) === false) {
        $fail("Unable to write to output file: " . $output_file);
    }
};

// The header is always the first row, even when the input contains no records.
$write_row(["Date", "Doc No.", "Debit", "Credit", "Balance", "Details"]);

// Byte offset and length of each fixed-width column within a continuation line.
$columns = [
    "docno"   => [0, 11],
    "debit"   => [40, 19],
    "credit"  => [59, 30],
    "balance" => [90, 19],
];
// The details column runs from this offset to the end of the line.
$details_offset = 110;

$row = null;        // record currently being accumulated; null until the first date line
$skip_line = true;  // true while outside a record (before the first date, after a footer)

while (($line = fgets($handle_input)) !== false) {
    $trimmed_line = ltrim($line);

    // A short line starting with dd/mm/yyyy opens a new record and flushes the previous one.
    if (strlen($trimmed_line) < 20 && preg_match("/^\d{2}\/\d{2}\/\d{4}/", $trimmed_line)) {
        if ($row !== null) {
            $write_row($row);
        }
        $row = [
            // 11 bytes: the 10-character date plus the byte that follows it
            // (whitespace there is stripped again when the row is written).
            "date"    => (string) substr($trimmed_line, 0, 11),
            "docno"   => "",
            "debit"   => "",
            "credit"  => "",
            "balance" => "",
            "details" => "",
        ];
        $skip_line = false;
        continue;
    }

    // Stop collecting at a "Postal address" line until the next date line.
    if (strpos($trimmed_line, "Postal address") === 0) {
        $skip_line = true;
    }
    if ($skip_line) {
        continue;
    }

    // Slices use the untrimmed line because the offsets are absolute positions. The cast
    // covers PHP < 8, where substr() returns false for an offset past the end of the line.
    foreach ($columns as $name => $span) {
        $row[$name] .= trim((string) substr($line, $span[0], $span[1]));
    }
    // Only line breaks are trimmed here, so inner and leading spaces of the details text
    // survive until the whitespace collapse in $write_row.
    // NOTE(review): consecutive detail lines are joined without a separator, so words may
    // run together when a line has no leading/trailing space -- confirm this is intended.
    $row["details"] .= trim((string) substr($line, $details_offset), "\n\r");
}

// Flush the last record, which has no following date line to trigger the write.
if ($row !== null) {
    $write_row($row);
}

fclose($handle_input);
fclose($handle_output);

$time_end = microtime(true);
$execution_time = number_format($time_end - $time_start, 2);
echo "Execution time: " . $execution_time . "s" . PHP_EOL;
echo "Peak memory usage: " . number_format((memory_get_peak_usage(true)/1024/1024), 2) . "MB" . PHP_EOL;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment