cwells · August 22, 2018 19:13
diff --git a/main.jl b/main.jl
 include("./strings.jl")

 # Command-line entry point: julia main.jl <filename> [stride] [highest]
 # Only the filename is required. `stride` and `highest` are parsed as integers
 # when present; when they are omitted the defaults declared on `main` apply.
 isempty(ARGS) && throw(ArgumentError("usage: julia main.jl <filename> [stride] [highest]"))

 filename = ARGS[1]
 # At most two optional integers follow the filename; extra arguments are ignored.
 options = Int64[parse(Int64, argument) for argument in ARGS[2 : min(3, end)]]
 @time main(filename, options...)
diff --git a/strings.jl b/strings.jl
 using Printf

 # Type vocabulary shared by the functions in this file. The aliases are declared
 # `const` so that they are compile-time constants rather than untyped,
 # reassignable globals.
 const Phrase = String
 const Frequency = Int64
 # A single `phrase => count` entry.
 const PhraseFrequency = Pair{Phrase, Frequency}
 # Maps each phrase to the number of times it occurs.
 const FrequencyTable = Dict{Phrase, Frequency}
 # Rank-numbered iterator over a vector of `phrase => count` entries.
 const Summary = Base.Iterators.Enumerate{Array{PhraseFrequency, 1}}

 """
    analyze(text, stride)

 Accepts a block of text and returns a dictionary describing
 the frequency of phrases of `stride` word length.

 ```julia-repl
 julia> analyze("this text contains this text which makes this text repetitive", 2)

 Dict{String,Int64} with 7 entries:
  "text which"      => 1
  "this text"       => 3
  "text repetitive" => 1
  "which makes"     => 1
  ⋮                 => ⋮
 ```
 """
 function analyze(text::String, stride::Int64)::FrequencyTable
  table = FrequencyTable()
  words = split(lowercase(text), r"[^a-z0-9'$%]"; keepempty=false)
  count = length(words)

  if count < stride return table end

  for i in 1 : (count - count % stride) - stride + 1
    slice = @views words[i : i + stride - 1]
    phrase = join(slice, " ")
    table[phrase] = get(table, phrase, 0) + 1
  end

  table
 end

 """
    top(highest, frequencies)

 Accepts a dictionary of frequency data and returns a summary
 of the phrases with the highest frequencies, up to `highest`
 results, in descending order.

 ```julia-repl
 julia> collect(top(3, analyze("this text contains this text which makes this text repetitive", 2)))

 3-element Array{Tuple{Int64,Pair{String,Int64}},1}:
 (1, "this text" => 3)
 (2, "text which" => 1)
 (3, "text repetitive" => 1)
 ```
 """
 function top(highest::Int64, frequencies::FrequencyTable)::Summary
  enumerate(sort!(
    collect(frequencies),
    by  = (phrase, frequency)::PhraseFrequency -> frequency,
    rev = true
  )[1 : min(highest, end)])
 end

 """
    main(filename, stride, highest)

 Analyzes the provided file and summarizes the results.

 ```julia-repl
 julia> @time main("testdata/pg2009.txt", 3)

 rank    freq   phrase (length 3)
    1     320   of the same
    2     130   the same species
    3     125   conditions of life
    4     117   in the same
    5     110   of natural selection
    6     104   from each other
    7     102   species of the
    8      89   on the other
    9      81   the other hand
   10      78   the case of

  0.407408 seconds (2.28 M allocations: 99.854 MiB, 32.67% gc time)
 ```
 """
 function main(filename::String, stride::Int64 = 3, highest::Int64 = 10)
  open(filename, "r") do file
    text = read(file, String)
    results = analyze(text, stride)
    @printf("\n%5s %7s   %s (length %i)", "rank", "freq", "phrase", stride)
    for (rank, (phrase, frequency)) in top(highest, results)
      @printf("\n%5i %7i   %s", rank, frequency, phrase)
    end
    @printf("\n\n")
  end
 end
	include("./strings.jl")

	# Command-line entry point: julia main.jl <filename> [stride] [highest]
	# Only the filename is required. `stride` and `highest` are parsed as integers
	# when present; when they are omitted the defaults declared on `main` apply.
	isempty(ARGS) && throw(ArgumentError("usage: julia main.jl <filename> [stride] [highest]"))

	filename = ARGS[1]
	# At most two optional integers follow the filename; extra arguments are ignored.
	options = Int64[parse(Int64, argument) for argument in ARGS[2 : min(3, end)]]
	@time main(filename, options...)
	using Printf

	# Type vocabulary shared by the functions in this file. The aliases are declared
	# `const` so that they are compile-time constants rather than untyped,
	# reassignable globals.
	const Phrase = String
	const Frequency = Int64
	# A single `phrase => count` entry.
	const PhraseFrequency = Pair{Phrase, Frequency}
	# Maps each phrase to the number of times it occurs.
	const FrequencyTable = Dict{Phrase, Frequency}
	# Rank-numbered iterator over a vector of `phrase => count` entries.
	const Summary = Base.Iterators.Enumerate{Array{PhraseFrequency, 1}}

	"""
	analyze(text, stride)

	Accepts a block of text and returns a dictionary describing
	the frequency of phrases of `stride` word length.

	```julia-repl
	julia> analyze("this text contains this text which makes this text repetitive", 2)

	Dict{String,Int64} with 7 entries:
	"text which" => 1
	"this text" => 3
	"text repetitive" => 1
	"which makes" => 1
	⋮ => ⋮
	```
	"""
	function analyze(text::String, stride::Int64)::FrequencyTable
	table = FrequencyTable()
	words = split(lowercase(text), r"[^a-z0-9'$%]"; keepempty=false)
	count = length(words)

	if count < stride return table end

	for i in 1 : (count - count % stride) - stride + 1
	slice = @views words[i : i + stride - 1]
	phrase = join(slice, " ")
	table[phrase] = get(table, phrase, 0) + 1
	end

	table
	end

	"""
	top(highest, frequencies)

	Accepts a dictionary of frequency data and returns a summary
	of the phrases with the highest frequencies, up to `highest`
	results, in descending order.

	```julia-repl
	julia> collect(top(3, analyze("this text contains this text which makes this text repetitive", 2)))

	3-element Array{Tuple{Int64,Pair{String,Int64}},1}:
	(1, "this text" => 3)
	(2, "text which" => 1)
	(3, "text repetitive" => 1)
	```
	"""
	function top(highest::Int64, frequencies::FrequencyTable)::Summary
	enumerate(sort!(
	collect(frequencies),
	by = (phrase, frequency)::PhraseFrequency -> frequency,
	rev = true
	)[1 : min(highest, end)])
	end

	"""
	main(filename, stride, highest)

	Analyzes the provided file and summarizes the results.

	```julia-repl
	julia> @time main("testdata/pg2009.txt", 3)

	rank freq phrase (length 3)
	1 320 of the same
	2 130 the same species
	3 125 conditions of life
	4 117 in the same
	5 110 of natural selection
	6 104 from each other
	7 102 species of the
	8 89 on the other
	9 81 the other hand
	10 78 the case of

	0.407408 seconds (2.28 M allocations: 99.854 MiB, 32.67% gc time)
	```
	"""
	function main(filename::String, stride::Int64 = 3, highest::Int64 = 10)
	open(filename, "r") do file
	text = read(file, String)
	results = analyze(text, stride)
	@printf("\n%5s %7s %s (length %i)", "rank", "freq", "phrase", stride)
	for (rank, (phrase, frequency)) in top(highest, results)
	@printf("\n%5i %7i %s", rank, frequency, phrase)
	end
	@printf("\n\n")
	end
	end