Skip to content

Instantly share code, notes, and snippets.

@cwells
Last active August 22, 2018 19:13
Show Gist options
  • Save cwells/c278d9b40a084536d61e4a9d856989c7 to your computer and use it in GitHub Desktop.
Save cwells/c278d9b40a084536d61e4a9d856989c7 to your computer and use it in GitHub Desktop.
# Command-line driver.
#
# Usage: julia <this script> <filename> [stride] [highest]
#
# Loads the analysis code from strings.jl, then times a call to `main` on the
# given file. `stride` and `highest` are optional; when they are omitted the
# defaults declared by `main` apply. Arguments beyond the third are ignored.
include("./strings.jl")

if isempty(ARGS)
    println(stderr, "usage: julia ", PROGRAM_FILE, " <filename> [stride] [highest]")
    exit(1)
end

# `let` keeps the parsed arguments out of (untyped) global scope.
let filename = ARGS[1], numbers = [parse(Int64, arg) for arg in ARGS[2:min(end, 3)]]
    @time main(filename, numbers...)
end
using Printf
# Type aliases shared by the functions in this file. They are declared `const`
# so that signatures and return-type annotations which mention them refer to a
# fixed type instead of an untyped, reassignable global.

# A phrase is a run of words joined by single spaces; a frequency is its count.
const Phrase = String
const Frequency = Int64

# One phrase together with the number of times it occurs.
const PhraseFrequency = Pair{Phrase, Frequency}

# Mapping from each phrase to its number of occurrences.
const FrequencyTable = Dict{Phrase, Frequency}

# Ranked results: an `enumerate` wrapper around a vector of phrase/frequency pairs.
const Summary = Base.Iterators.Enumerate{Array{PhraseFrequency, 1}}
"""
    analyze(text, stride)

Count how often every phrase of `stride` consecutive words occurs in `text`.

The text is lowercased and split into words; any character other than an ASCII
lowercase letter, a digit, an apostrophe, a dollar sign or a percent sign
separates words, and empty words are discarded. A window of `stride` words is
then slid over the word list one word at a time, and each window, joined with
single spaces, is counted as one phrase.

Returns a `FrequencyTable` mapping each phrase to its number of occurrences.
The table is empty when the text contains fewer than `stride` words.

Throws an `ArgumentError` when `stride` is not positive.

```julia-repl
julia> analyze("this text contains this text which makes this text repetitive", 2)
Dict{String,Int64} with 7 entries:
  "text which"      => 1
  "this text"       => 3
  "text repetitive" => 1
  "which makes"     => 1
  ⋮                 => ⋮
```
"""
function analyze(text::String, stride::Int64)::FrequencyTable
    stride > 0 || throw(ArgumentError("stride must be positive, got $stride"))

    table = FrequencyTable()
    words = split(lowercase(text), r"[^a-z0-9'$%]"; keepempty = false)

    # A window starts at every word that still has `stride - 1` successors, so
    # the last start index is `length(words) - stride + 1`. The range is empty
    # when there are fewer than `stride` words.
    for start in 1:(length(words) - stride + 1)
        phrase = join(@view(words[start:(start + stride - 1)]), " ")
        table[phrase] = get(table, phrase, 0) + 1
    end

    return table
end
"""
    top(highest, frequencies)

Rank the phrases in `frequencies` from most to least frequent and return at
most `highest` of them.

The result is an `enumerate` iterator over a vector of `phrase => frequency`
pairs, so iterating it yields `(rank, phrase => frequency)` tuples with ranks
starting at 1. When the table holds fewer than `highest` phrases, all of them
are returned.

```julia-repl
julia> collect(top(3, analyze("this text contains this text which makes this text repetitive", 2)))
3-element Array{Tuple{Int64,Pair{String,Int64}},1}:
 (1, "this text" => 3)
 (2, "text which" => 1)
 (3, "text repetitive" => 1)
```
"""
function top(highest::Int64, frequencies::FrequencyTable)::Summary
    # Materialize the table as a vector of pairs so it can be ordered in place.
    ranked = collect(frequencies)
    sort!(ranked; by = entry -> entry.second, rev = true)

    # Never slice past the end when fewer than `highest` phrases exist.
    cutoff = min(highest, length(ranked))
    return enumerate(ranked[1:cutoff])
end
"""
    main(filename, stride, highest)

Read the file `filename`, count its phrases of `stride` words with `analyze`,
and print the `highest` most frequent ones, as selected by `top`, to standard
output as a table of rank, frequency and phrase.

`stride` defaults to 3 and `highest` defaults to 10. Returns `nothing`.

```julia-repl
julia> main("testdata/pg2009.txt", 3)

 rank    freq phrase (length 3)
    1     320 of the same
    2     130 the same species
    3     125 conditions of life
    ⋮

```
"""
function main(filename::String, stride::Int64 = 3, highest::Int64 = 10)
    table = analyze(read(filename, String), stride)

    # Every line is emitted with a leading newline, so the table starts after a
    # blank line and the final print closes the last row.
    @printf("\n%5s %7s %s (length %i)", "rank", "freq", "phrase", stride)
    for (position, entry) in top(highest, table)
        @printf("\n%5i %7i %s", position, entry.second, entry.first)
    end
    @printf("\n\n")

    return nothing
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment