Created
October 26, 2012 21:18
-
-
Save Feh/3961597 to your computer and use it in GitHub Desktop.
When the disk cache is full, grep(1) is single-threaded and CPU-bound. I tried implementing a simple "fgrep -IR" in Go using channels and one goroutine per file, but the program is an order of magnitude slower. Why?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"os" | |
"io" | |
"bufio" | |
"strings" | |
"flag" | |
"bytes" | |
"runtime/pprof" | |
) | |
func lines(fn string) chan string { | |
c := make(chan string) | |
go func() { | |
defer close(c) | |
f, err := os.Open(fn) | |
if err != nil { | |
fmt.Fprintln(os.Stderr, "Cannot open", fn + ":", err) | |
return | |
} | |
defer f.Close() | |
data := make([]byte, 256) | |
count, err := f.Read(data) | |
isBinary := bytes.Contains(data[:count], []byte{0}) | |
f.Seek(0, 0) // rewind | |
if isBinary { | |
return | |
} | |
bf := bufio.NewReader(f) | |
for { | |
line, isPrefix, err := bf.ReadLine() | |
if err == io.EOF { | |
return | |
} | |
if err == nil && !isPrefix { | |
c <- string(line) | |
} | |
} | |
}() | |
return c | |
} | |
func matches(substr, fn string) chan string { | |
c := make(chan string) | |
go func() { | |
defer close(c) | |
for l := range lines(fn) { | |
if strings.Contains(string(l), substr) { | |
c <- fn + ": " + l | |
} | |
} | |
}() | |
return c | |
} | |
func lsr(dir string) (c []string) { | |
c = make([]string, 0) | |
h, err := os.Open(dir) | |
if err != nil { | |
fmt.Fprintln(os.Stderr, "warning:", err) | |
return | |
} | |
defer h.Close() | |
entries, err := h.Readdirnames(0) | |
if err != nil { | |
fmt.Fprintln(os.Stderr, "warning:", err) | |
return | |
} | |
for _, entry := range entries { | |
e := dir + "/" + entry | |
info, err := os.Stat(e) | |
if err != nil { | |
fmt.Fprintln(os.Stderr, "warning: cannot stat("+e+"):", err) | |
continue | |
} | |
if info.IsDir() { | |
c = append(c, lsr(e)...) | |
} else { | |
c = append(c, e) | |
} | |
} | |
return | |
} | |
func fgrepr(str, dir string) { | |
workers := make(chan chan string) | |
go func() { | |
for _, fn := range lsr(dir) { | |
workers <- matches(str, fn) | |
} | |
close(workers) | |
}() | |
for w := range workers { | |
for l := range w { | |
fmt.Println(l) | |
} | |
} | |
} | |
func main() { | |
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") | |
flag.Parse() | |
if *cpuprofile != "" { | |
f, err := os.Create(*cpuprofile) | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
pprof.StartCPUProfile(f) | |
defer pprof.StopCPUProfile() | |
} | |
args := flag.Args() | |
if len(args) != 2 { | |
fmt.Println("Usage: fgrep <pattern> <directory>") | |
os.Exit(1) | |
} | |
fgrepr(args[0], args[1]) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello,
I think the bottleneck are the goroutines.
With a simpler approach I think I have better performance than grep/fgrep on simple test case (match string).
What do you think ?