Last active
October 29, 2018 14:12
-
-
Save jdhenckel/311c73f199f628bf2106348f32405658 to your computer and use it in GitHub Desktop.
CSV Parser (comma delimited values) written in Java and works with EXCEL exported files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//-------------------------------------------------------------------------------
// This file is made available under the Creative Commons CC0 1.0 Universal Public Domain Dedication.
// The person who associated a work with this deed has dedicated the work to the public domain by
// waiving all of his or her rights to the work worldwide under copyright law, including all related
// and neighboring rights, to the extent allowed by law. You can copy, modify, distribute and perform
// the work, even for commercial purposes, all without asking permission.
import java.io.IOException; | |
import java.io.Reader; | |
import java.io.StringReader; | |
import java.util.ArrayList; | |
import java.util.List; | |
//--------------------------------------------------------------------------------
// Parser for files that contain comma-separated values.
// It is designed to handle files exported from Excel. For example:
/* ==== SAMPLE BEGIN ====
"Name", "Age", Description
John Henckel,52,This is a string of text
Carin Pie,28,"This
text is broken
over \"several\" lines."
Phen "Bob" Weirsby,,missing age
==== SAMPLE END ==== */
// Notice that a newline is a record delimiter, except within quotes. Also, within quotes
// you can use the escape sequences \n \" \\ \t. Other escape sequences are left unchanged.
// The " character is allowed in a non-quoted field if it is not the first non-blank character.
// Fields not in quotes are trimmed of leading and trailing whitespace.
/**
 * Static helpers that parse comma separated values from a {@link Reader}.
 *
 * <p>Format rules implemented by this class:
 * <ul>
 *   <li>A comma ends a field and a newline ends a record, except inside a quoted section.</li>
 *   <li>A field is quoted when its first non-whitespace character is a double quote.</li>
 *   <li>Inside quotes, backslash-n, backslash-t, backslash-backslash and backslash-quote are
 *       decoded; any other backslash sequence is kept verbatim.</li>
 *   <li>Inside quotes, two consecutive double quotes stand for one literal double quote
 *       (the RFC 4180 convention).</li>
 *   <li>Unquoted fields have leading and trailing whitespace removed; quoted content is kept
 *       exactly as written.</li>
 *   <li>Non-whitespace text between a closing quote and the next delimiter is discarded and
 *       reported as a warning.</li>
 * </ul>
 */
public class ParserForCSV {

    //--------------------------------------------------------------------------------
    /**
     * Reads every record up to the end of the input.
     *
     * @param file     source of the CSV text; it is read to EOF but not closed here
     * @param messages receives any warning messages produced while parsing
     * @return one {@code String[]} per record, in input order; empty when the input has no records
     * @throws IOException if reading from {@code file} fails
     */
    public static List<String[]> parse(Reader file, List<String> messages) throws IOException {
        List<String[]> result = new ArrayList<String[]>();
        int n = 10;     // expected number of fields, passed to parseRecord as a sizing hint
        for (int i = 0;; ++i) {
            String[] row = parseRecord(file, messages, i, n);
            if (row == null) break;
            result.add(row);
            // Let the hint follow wider rows, but never more than double it in one step.
            if (row.length > n) n = Math.min(row.length, 2*n);
        }
        return result;
    }

    //--------------------------------------------------------------------------------
    /**
     * Reads one record into an array of strings.
     *
     * <p>Lines whose only field is empty are skipped. Note that this makes a record consisting
     * of a single empty field (quoted or not) indistinguishable from a blank line.
     *
     * @param file      source of the CSV text
     * @param messages  receives any warning messages
     * @param rowNumber record number used in warning messages (passed through unchanged)
     * @param hint      expected number of fields; only used to size the internal list
     * @return the fields of the record, or {@code null} at end of input
     * @throws IOException if reading from {@code file} fails
     */
    public static String[] parseRecord(Reader file, List<String> messages, int rowNumber, int hint) throws IOException {
        StringBuilder sb = new StringBuilder();
        List<String> row = new ArrayList<String>(Math.min(Math.max(10, hint), 1000));
        for (;;) {
            // row.size() is the zero-based index of the field about to be read.
            int c = parseField(file, sb, messages, rowNumber, row.size());
            if (row.isEmpty() && sb.length() == 0) {
                if (c == -1)
                    return null;    // end of the file
                if (c == '\n')
                    continue;       // ignore blank lines in the middle of the file
            }
            row.add(sb.toString());
            if (c != ',') break;
        }
        return row.toArray(new String[row.size()]);
    }

    //--------------------------------------------------------------------------------
    /**
     * Reads a single field into {@code result}, replacing its previous content.
     *
     * @param file     source of the CSV text
     * @param result   buffer that receives the field value
     * @param messages receives any warning messages
     * @param row      record number used in warning messages (printed as given)
     * @param col      zero-based field index; warning messages print {@code col + 1}
     * @return the character that ended the field: comma, newline, or -1 for EOF
     * @throws IOException if reading from {@code file} fails
     */
    public static int parseField(Reader file, StringBuilder result, List<String> messages, int row, int col) throws IOException {
        result.setLength(0);
        StringBuilder dump = null;      // unexpected text found after a closing quote
        boolean afterQuotes = false;    // true once a quoted section has been consumed
        int c = file.read();
        for (;;) {
            if (c == '\n' || c == ',' || c == -1) {
                if (dump != null)
                    messages.add("WARN: Ignoring data in record "+row+" field "+(col+1)+": " + dump);
                // Only unquoted fields are trimmed; quoted content is kept verbatim.
                if (!afterQuotes) trimTrailingWhitespace(result);
                return c;
            }
            if (afterQuotes) {
                // Looking for the delimiter: whitespace is skipped until something else shows up,
                // after which everything up to the delimiter is collected for the warning.
                if (dump != null)
                    dump.append((char) c);
                else if (!Character.isWhitespace(c))
                    dump = new StringBuilder().append((char) c);
            }
            else if (result.length() == 0 && c == '"') {
                // A quote as the first non-whitespace character opens a quoted section.
                c = readQuoted(file, result, messages, row, col);
                afterQuotes = true;
                continue;   // c already holds the character that follows the closing quote
            }
            else if (result.length() > 0 || !Character.isWhitespace(c)) {
                // Leading whitespace of an unquoted field is dropped by never appending it.
                result.append((char) c);
            }
            c = file.read();
        }
    }

    //--------------------------------------------------------------------------------
    // Reads the body of a quoted section (the opening quote has already been consumed),
    // appending the decoded text to result. Returns the character that follows the closing
    // quote, or -1 if the input ended (a warning is added when it ended inside the quotes).
    private static int readQuoted(Reader file, StringBuilder result, List<String> messages, int row, int col) throws IOException {
        for (;;) {
            int c = file.read();
            if (c == -1) {
                messages.add("WARN: Unexpected EOF in record "+row+" field "+(col+1));
                return -1;
            }
            if (c == '"') {
                c = file.read();
                if (c != '"')
                    return c;           // that was the closing quote
                result.append('"');     // a doubled quote is one literal quote
                continue;
            }
            if (c == '\\') {
                c = file.read();
                if (c == -1) {
                    // Input ended right after the backslash: keep the backslash itself and
                    // do not append the EOF marker as if it were a character.
                    result.append('\\');
                    messages.add("WARN: Unexpected EOF in record "+row+" field "+(col+1));
                    return -1;
                }
                // Interpret \n, \t, \\, and \" in the strings; keep other sequences as written.
                if (c == 'n') c = '\n';
                else if (c == 't') c = '\t';
                else if (c != '"' && c != '\\') result.append('\\');
            }
            result.append((char) c);
        }
    }

    //--------------------------------------------------------------------------------
    // Removes whitespace from the end of the buffer.
    private static void trimTrailingWhitespace(StringBuilder text) {
        int end = text.length();
        while (end > 0 && Character.isWhitespace(text.charAt(end - 1))) --end;
        text.setLength(end);
    }

    //--------------------------------------------------------------------------------
    /**
     * Demo: parses a built-in sample and prints the resulting rows, followed by the parser
     * warnings, to standard output.
     */
    public static void TestCase() {
        String data =
            "\"Name\", \"Age\", Description\n" +
            "John Henckel,52,This is a string of text\n" +
            "Thomas Moore,\"52\" years old\n" +
            "Carin Pie,28,\"This\n" +
            "text is broken\n" +
            "over \\\"several\\\" lines.\"\n" +
            "Phen \"Bob\" Weirsby,,\"missing age\n";
        List<String> messages = new ArrayList<>();
        List<String[]> result;
        try {
            result = parse(new StringReader(data), messages);
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        int i = 0;
        System.out.println("---- result data -----");
        for (String[] row: result) {
            System.out.println("BEGIN ROW " + i++);
            for (String item: row) System.out.println(" DATA: \""+item+"\"");
        }
        System.out.println("---- parser messages -----");
        for (String m: messages) {
            System.out.println(m);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment