Created
June 7, 2022 13:38
-
-
Save pjox/54e6a176679a1fc8cb653048bb4dd737 to your computer and use it in GitHub Desktop.
Count the Number of Documents in OSCAR 21.09 for a given language
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"compress/gzip" | |
"errors" | |
"fmt" | |
"log" | |
"os" | |
"path/filepath" | |
"sync" | |
) | |
type numDocs struct { | |
mux sync.Mutex | |
docs int | |
} | |
func (nd *numDocs) sumDocs(fileDocs int) { | |
nd.mux.Lock() | |
defer nd.mux.Unlock() | |
nd.docs += fileDocs | |
return | |
} | |
func extractNCount(path string, numdocs *numDocs) error { | |
count := 0 | |
//open gzip file | |
fi, err := os.Open(path) | |
if err != nil { | |
return err | |
} | |
defer fi.Close() | |
fz, err := gzip.NewReader(fi) | |
if err != nil { | |
return err | |
} | |
defer fz.Close() | |
bufin := bufio.NewReader(fz) | |
for line, err := bufin.ReadString('\n'); err == nil; line, err = bufin.ReadString('\n') { | |
if line == "\n" { | |
count++ | |
} | |
} | |
//There is a double \n at the end of the file | |
count -= 1 | |
fmt.Println(path) | |
fmt.Println(count) | |
numdocs.sumDocs(count) | |
return nil | |
} | |
func walkFiles(done <-chan struct{}, root string) (<-chan string, <-chan error) { | |
paths := make(chan string) | |
errc := make(chan error, 1) | |
go func() { // HL | |
// Close the paths channel after Walk returns. | |
defer close(paths) // HL | |
// No select needed for this send, since errc is buffered. | |
errc <- filepath.Walk(root, func(path string, info os.FileInfo, err error) error { // HL | |
if err != nil { | |
return err | |
} | |
if !info.Mode().IsRegular() { | |
return nil | |
} | |
// Pay attention to where you put the binary | |
if match, _ := filepath.Match("*/*.txt.gz", path); !match { | |
return nil | |
} | |
select { | |
case paths <- path: // HL | |
case <-done: // HL | |
return errors.New("walk canceled") | |
} | |
return nil | |
}) | |
}() | |
return paths, errc | |
} | |
func main() { | |
done := make(chan struct{}) | |
defer close(done) | |
paths, errc := walkFiles(done, os.Args[1]) | |
var wg sync.WaitGroup | |
maxGoroutines := 60 | |
guard := make(chan struct{}, maxGoroutines) | |
numdocs := numDocs{} | |
for path := range paths { | |
wg.Add(1) | |
go func(path string) { | |
guard <- struct{}{} | |
err := extractNCount(path, &numdocs) | |
if err != nil { | |
log.Fatalln(err) | |
} | |
<-guard | |
wg.Done() | |
}(path) | |
} | |
// Check whether the Walk failed. | |
if err := <-errc; err != nil { // HLerrc | |
log.Fatal(err) | |
} | |
wg.Wait() | |
fmt.Printf("\n%d\n", numdocs.docs) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment