Created
February 28, 2017 17:23
-
-
Save NeilMadden/67ba9997a054943b99d1b765edd92158 to your computer and use it in GitHub Desktop.
Neil Madden "Functional Programming in Erlang MOOC" Submission for Week 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([index_file/1,get_file_contents/1,show_file_contents/1]). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
get_file_contents(Name) -> | |
{ok,File} = file:open(Name,[read]), | |
Rev = get_all_lines(File,[]), | |
lists:reverse(Rev). | |
% Auxiliary function for get_file_contents. | |
% Not exported. | |
get_all_lines(File,Partial) -> | |
case io:get_line(File,"") of | |
eof -> file:close(File), | |
Partial; | |
Line -> {Strip,_} = lists:split(length(Line)-1,Line), | |
get_all_lines(File,[Strip|Partial]) | |
end. | |
% Show the contents of a list of strings. | |
% Can be used to check the results of calling get_file_contents. | |
show_file_contents([L|Ls]) -> | |
io:format("~s~n",[L]), | |
show_file_contents(Ls); | |
show_file_contents([]) -> | |
ok. | |
%% **** BEGIN SOLUTION **** | |
% We maintain our index using the form as described in the exercise, i.e., | |
% {Word, [{Start, End}, {Start, End}, ...]} | |
% Where each {Start, End} pair is a contiguous range of line numbers on which the | |
% given word occurs. Unlike the exercise, we reverse the order of the list so that | |
% the head element is always the most recently found line. This makes checking and | |
% updating the index cheap as we only have to check the head of the list at any | |
% point. We then | |
% index_file(Name) -- | |
% | |
% Indexes the given file and returns the index as a list of entries of the form | |
% {Word, [{Start,End}, {Start,End}, ...]} | |
% where each {Start, End} pair is a contiguous range of lines on which the given | |
% word occurs. | |
% | |
index_file(Name) -> | |
lists:keysort(1, reverse_indices(index_lines(get_file_contents(Name), 1, []))). | |
% reverse_indices(Index) -- | |
% | |
% Reverses the order of the occurrences for each word in the index, so that they | |
% are in the order in which they occur in the file. | |
% | |
reverse_indices([]) -> | |
[]; | |
reverse_indices([{Word, Occurrences}|Rest]) -> | |
[{Word, lists:reverse(Occurrences)} | reverse_indices(Rest)]. | |
% index_lines(Lines, LineNumber, Index) -- | |
% | |
% Indexes the list of Lines given as input, updating the Index with the LineNumbers | |
% on which each word occurs. Returns the updated index. | |
% | |
index_lines([], _, Index) -> | |
Index; | |
index_lines([Line|Lines], Num, Index) -> | |
% Split the line into words by whitespace and punctuation | |
Words = string:tokens(Line, " \t,.!;:\"\'[](){}&-<`\\"), | |
% Index all words within this line | |
NewIndex = index_words(Words, Num, Index), | |
% Index the remaining lines | |
index_lines(Lines, Num+1, NewIndex). | |
% index_words(Words, LineNumber, Index) -- | |
% | |
% Index all the words in a line. | |
% | |
index_words([], _, Index) -> | |
Index; | |
index_words([Word|Words], LineNum, Index) -> | |
NormalizedWord = normalize(Word), | |
% Find the word in the key list and update it with the new index entry | |
NewIndex = case update_entry(NormalizedWord, LineNum, | |
find(NormalizedWord, Index)) of | |
% false if the word was too boring to index | |
false -> Index; | |
NewEntry -> store(NormalizedWord, Index, NewEntry) | |
end, | |
% Now update the index with the new entry and index the remaining words | |
index_words(Words, LineNum, NewIndex). | |
find(Word, Index) -> | |
lists:keyfind(Word, 1, Index). | |
store(Word, Index, Entry) -> | |
lists:keystore(Word, 1, Index, Entry). | |
% normalize(Word) -- | |
% | |
% Normalizes the given word. Currently just converts to lowercase rather than | |
% performing stemming or other transformations. | |
% | |
normalize(Word) -> | |
string:to_lower(Word). | |
% update_entry(Word, LineNumber, ExistingEntry | false) -- | |
% | |
% Updates an entry in the index to include the given word at the given line number. | |
% If the entry does not exist then a new entry is created. If an entry exists that | |
% already covers this line number, then we do nothing. If an entry exists that | |
% extends up to the previous line, then we extend the range to include this line | |
% too. Otherwise, we add a new range containing just this new line number. | |
% | |
update_entry(Word, _, Entry) when length(Word) < 3 -> | |
Entry; | |
update_entry(Word, LineNum, false) -> | |
{Word, [{LineNum, LineNum}]}; | |
update_entry(Word, LineNum, Entry={Word, [{_,End}|_]}) when End == LineNum -> | |
% Entry already covers this line | |
Entry; | |
update_entry(Word, LineNum, {Word, [{Start,End}|Rest]}) when End == LineNum-1 -> | |
% There is an existing entry that extends to the previous line, so update it | |
{Word, [{Start,LineNum}|Rest]}; | |
update_entry(Word, LineNum, {Word, Rest}) -> | |
% This is a new occurrence that is discontiguous with any existing entry | |
{Word, [{LineNum,LineNum}|Rest]}. | |
% Efficiency considerations: | |
% | |
% The current approach just uses an unsorted key/value list to store the index, | |
% which requires an O(N) linear scan to both find and update entries. Maintaining | |
% a the list in sorted order (e.g., via some balanced binary tree structure) would | |
% improve this to O(log N). This would also avoid an additional separate O(N log N) | |
% sorting stage at the end. | |
% | |
% Even better would be to use Erlang's new maps (http://erlang.org/eeps/eep-0043.html) | |
% that are based on Hash Array Mapped Tries (HAMTs) that provide close to hash-table | |
% O(1) lookup and insertion. This would be fairly simple to change, just replacing | |
% the find/store procedures to use maps:get/2 and map update syntax and updating the | |
% code to pass through an initially empty map. The map key would be the normalized | |
% word, while the value would still be a list of occurrences (in most-recent-first order). | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment