Last active
August 22, 2018 19:13
-
-
Save cwells/c278d9b40a084536d61e4a9d856989c7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
include("./strings.jl")

# Usage: julia <script> <filename> [stride] [highest]
#
# Only the filename is required. When `stride` and/or `highest` are omitted,
# `main` falls back to the defaults declared in its signature. Arguments past
# the third are ignored.
if isempty(ARGS)
    println(stderr, "usage: julia ", PROGRAM_FILE, " <filename> [stride] [highest]")
    exit(1)
end

filename = ARGS[1]
# Convert whichever optional numeric arguments were supplied (zero, one or two).
limits = [parse(Int64, arg) for arg in ARGS[2:min(end, 3)]]
@time main(filename, limits...)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Printf

# Type aliases shared by the functions below. They are declared `const` so that
# code which refers to them at run time (e.g. `FrequencyTable()`) sees a fixed,
# inferrable type instead of an untyped global.

# A phrase is a string of words; a frequency is an occurrence count.
const Phrase = String
const Frequency = Int64
# A single `phrase => count` entry.
const PhraseFrequency = Pair{Phrase, Frequency}
# Mapping from each phrase to the number of times it occurs.
const FrequencyTable = Dict{Phrase, Frequency}
# Enumeration over a vector of `phrase => count` entries, yielding (index, entry).
const Summary = Base.Iterators.Enumerate{Array{PhraseFrequency, 1}}
"""
    analyze(text, stride)

Accepts a block of text and returns a dictionary describing
the frequency of phrases of `stride` word length.

The text is lowercased and split on every character other than the lowercase
letters a-z, the digits 0-9, the apostrophe, the dollar sign and the percent
sign; empty fragments are discarded. Every run of `stride` adjacent words,
joined by single spaces, is counted, so consecutive phrases overlap. An empty
table is returned when the text holds fewer than `stride` words.

# Throws
- `ArgumentError` if `stride` is less than 1.

# Examples
```julia-repl
julia> analyze("this text contains this text which makes this text repetitive", 2)
Dict{String,Int64} with 7 entries:
  "text which"      => 1
  "this text"       => 3
  "text repetitive" => 1
  "which makes"     => 1
  ⋮                 => ⋮
```
"""
function analyze(text::String, stride::Int64)::FrequencyTable
    stride > 0 || throw(ArgumentError("stride must be positive, got $stride"))
    table = FrequencyTable()
    words = split(lowercase(text), r"[^a-z0-9'$%]"; keepempty=false)
    # A window starts at every index that still leaves room for `stride` words,
    # including the final one. The range is empty when there are too few words.
    for start in 1:(length(words) - stride + 1)
        phrase = join(view(words, start:(start + stride - 1)), " ")
        table[phrase] = get(table, phrase, 0) + 1
    end
    return table
end
"""
    top(highest, frequencies)

Rank the entries of `frequencies` from the largest count to the smallest and
return an enumeration over at most `highest` of them. Each item yielded is a
`(rank, phrase => frequency)` tuple, with ranks counting up from 1.

```julia-repl
julia> collect(top(3, analyze("this text contains this text which makes this text repetitive", 2)))
3-element Array{Tuple{Int64,Pair{String,Int64}},1}:
 (1, "this text" => 3)
 (2, "text which" => 1)
 (3, "text repetitive" => 1)
```
"""
function top(highest::Int64, frequencies::FrequencyTable)::Summary
    ranked = collect(frequencies)
    # Order by the count stored in each pair, largest first.
    sort!(ranked; by = entry -> entry.second, rev = true)
    # Never ask for more entries than the table actually holds.
    cutoff = min(highest, length(ranked))
    return enumerate(ranked[1:cutoff])
end
"""
    main(filename, stride, highest)

Read the file at `filename`, count its phrases of `stride` words with
`analyze`, and print a table of the `highest` most frequent phrases as ranked
by `top`. `stride` defaults to 3 and `highest` to 10.

```julia-repl
julia> @time main("testdata/pg2009.txt", 3)

 rank    freq phrase (length 3)
    1     320 of the same
    2     130 the same species
    3     125 conditions of life
    4     117 in the same
    5     110 of natural selection
    6     104 from each other
    7     102 species of the
    8      89 on the other
    9      81 the other hand
   10      78 the case of

  0.407408 seconds (2.28 M allocations: 99.854 MiB, 32.67% gc time)
```
"""
function main(filename::String, stride::Int64 = 3, highest::Int64 = 10)
    # Load the whole file up front; the handle is closed before any printing.
    frequencies = analyze(read(filename, String), stride)
    @printf("\n%5s %7s %s (length %i)", "rank", "freq", "phrase", stride)
    for (position, entry) in top(highest, frequencies)
        @printf("\n%5i %7i %s", position, entry.second, entry.first)
    end
    @printf("\n\n")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment