Last active
August 29, 2015 14:01
-
-
Save dilap/46ef747dbd642ef6e834 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"log" | |
"os" | |
"runtime" | |
) | |
// substitutingSplitTabsFunc returns a bufio.SplitFunc that yields one
// tab-delimited field per token and records in *newRow whether the field just
// returned was the last one of its row (terminated by '\n' or by end of input).
//
// It does two jobs in a single pass:
//
//  1. Splits on tab-delimited fields. A field that starts with '"' is treated
//     as quoted: it ends at a '"' that is immediately followed by '\t', '\n' or
//     end of input, and a doubled quote ("") inside it stands for one literal
//     quote character.
//  2. Substitutes spaces for tabs and newlines found inside quoted fields, so
//     the output can be printed safely without quotes.
//
// Caveats:
//   - No error reporting: an unclosed quote at end of input yields the rest of
//     the input as the field, and a solitary quote inside a quoted field is
//     kept as a literal character.
//   - Quoted tokens are unescaped in place, i.e. the bytes of data handed in by
//     the scanner are overwritten.
//   - Empty input at EOF produces no token, so a trailing empty field after a
//     final tab is not reported.
func substitutingSplitTabsFunc(newRow *bool) bufio.SplitFunc {
	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if len(data) == 0 {
			// Either more data is needed or the input is exhausted. Returning a
			// nil token matters at EOF: bufio.Scanner would otherwise keep
			// receiving empty, non-advancing tokens and eventually panic.
			return 0, nil, nil
		}

		if data[0] != '"' { // easy case: unquoted field
			i := 0
			for i < len(data) && data[i] != '\t' && data[i] != '\n' {
				i++
			}
			if i == len(data) {
				if !atEOF {
					// No delimiter seen yet: request more data.
					return 0, nil, nil
				}
				// Final field of the input, not terminated by a newline.
				*newRow = true
				return len(data), data, nil
			}
			*newRow = data[i] == '\n'
			return i + 1, data[:i], nil
		}

		// Hard case: quoted field. i walks the bytes after the opening quote
		// and stops on the closing quote (or at len(data) for an unclosed one).
		i := 1
		hasEscapedQuotes, hasBreaks := false, false
	scan:
		for {
			if i == len(data) {
				if !atEOF {
					return 0, nil, nil // need more data
				}
				// Unclosed quote at end of input: take what is there.
				*newRow = true
				advance = len(data)
				break scan
			}
			switch data[i] {
			case '"':
				if i+1 == len(data) {
					if !atEOF {
						// Cannot yet tell a closing quote from an escaped pair.
						return 0, nil, nil
					}
					// Closing quote is the very last byte of the input.
					*newRow = true
					advance = len(data)
					break scan
				}
				switch data[i+1] {
				case '\n':
					*newRow = true
					advance = i + 2
					break scan
				case '\t':
					*newRow = false
					advance = i + 2
					break scan
				case '"':
					// Escaped quote: step over the first quote here and the
					// second one with the i++ below, so the second quote is
					// never mistaken for a closing quote.
					hasEscapedQuotes = true
					i++
				}
				// Any other follower: solitary quote, kept literally.
			case '\n', '\t':
				hasBreaks = true
			}
			i++
		}

		token = data[1:i]
		if hasEscapedQuotes {
			// Collapse each "" pair to a single quote, compacting in place.
			w := 0
			for r := 0; r < len(token); r++ {
				token[w] = token[r]
				w++
				if token[r] == '"' && r+1 < len(token) && token[r+1] == '"' {
					r++
				}
			}
			token = token[:w]
		}
		if hasBreaks {
			for j, c := range token {
				if c == '\n' || c == '\t' {
					token[j] = ' '
				}
			}
		}
		return advance, token, nil
	}
}
// Make scanner to split on tab-delimitted, possibly-quoted csv fields. Each call to scanner.Scan() will return the next field; when the returned field is the last field in a row, newRow will be true. This method of parsing CSV is (as of this writing, Oct 2013), much faster than the built-in csv module. Also replaces tabs and newlines with spaces. | |
func SplitTabsAndSub(s *bufio.Scanner) (newRow *bool) { | |
newRow = new(bool) | |
s.Split(substitutingSplitTabsFunc(newRow)) | |
return | |
} | |
func main() { | |
runtime.GOMAXPROCS(2) | |
if len(os.Args) < 2 { | |
fmt.Println(`usage: %s FNAME > CLEANFNAME | |
Read tab-separated file FNAME and write a simplified version to | |
stdout. Simplified => tabs and newlines are replaced with spaces, no quotes | |
are used | |
`, os.Args[0]) | |
os.Exit(64) | |
} | |
f, err := os.Open(os.Args[1]) | |
if err != nil { | |
log.Fatal(err) | |
} | |
scanner := bufio.NewScanner(f) | |
newRow := SplitTabsAndSub(scanner) | |
w := bufio.NewWriter(os.Stdout) | |
newline, tab := []byte("\n"), []byte("\t") | |
for scanner.Scan() { | |
w.Write(scanner.Bytes()) | |
if *newRow { | |
w.Write(newline) | |
} else { | |
w.Write(tab) | |
} | |
} | |
w.Flush() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment