Last active
June 20, 2022 13:00
-
-
Save matthewmorrone/33f5bdab4161651b92d3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BM25 = {}; | |
// see attached file | |
BM25.stopWords = []; | |
// TODO: https://tartarus.org/martin/PorterStemmer/js.txt | |
BM25.stemmer = function(term) { | |
return term; | |
} | |
BM25.tokenize = function(text) { | |
text = text | |
.toLowerCase() | |
.replace(/\W/g, ' ') | |
.replace(/\s+/g, ' ') | |
.trim() | |
.split(' ') | |
.map(function(a) { return BM25.stemmer(a); }); | |
// Filter out stopWords | |
let out = []; | |
for (let i = 0, len = text.length; i < len; i++) { | |
if (BM25.stopWords.indexOf(text[i]) === -1) { | |
out.push(text[i]); | |
} | |
} | |
return out; | |
}; | |
BM25.addDocument = function(doc) { | |
if (typeof doc.id === 'undefined') { throw new Error(1000, 'ID is a required property of documents.'); }; | |
if (typeof doc.body === 'undefined') { throw new Error(1001, 'Body is a required property of documents.'); }; | |
// Raw tokenized list of words | |
let tokens = BM25.tokenize(doc.body); | |
// Will hold unique terms and their counts and frequencies | |
let _terms = {}; | |
// docObj will eventually be added to the documents database | |
let docObj = {id: doc.id, tokens: tokens, body: doc.body}; | |
// Count number of terms | |
docObj.termCount = tokens.length; | |
// Increment totalDocuments | |
this.totalDocuments++; | |
// Readjust averageDocumentLength | |
this.totalDocumentTermLength += docObj.termCount; | |
this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments; | |
// Calculate term frequency | |
// First get terms count | |
for (let i = 0, len = tokens.length; i < len; i++) { | |
let term = tokens[i]; | |
if (!_terms[term]) { | |
_terms[term] = { | |
count: 0, | |
freq: 0 | |
}; | |
}; | |
_terms[term].count++; | |
} | |
// Then re-loop to calculate term frequency. | |
// We'll also update inverse document frequencies here. | |
let keys = Object.keys(_terms); | |
for (let i = 0, len = keys.length; i < len; i++) { | |
let term = keys[i]; | |
// Term Frequency for this document. | |
_terms[term].freq = _terms[term].count / docObj.termCount; | |
// Inverse Document Frequency initialization | |
if (!this.terms) { | |
this.terms = {}; | |
} | |
if (!this.terms[term]) { | |
this.terms[term] = { | |
n: 0, // Number of docs this term appears in, uniquely | |
idf: 0 | |
}; | |
} | |
this.terms[term].n++; | |
}; | |
// Calculate inverse document frequencies | |
// This is SLOWish so if you want to index a big batch of documents, | |
// comment this out and run it once at the end of your addDocuments run | |
// If you're only indexing a document or two at a time you can leave this in. | |
// this.updateIdf(); | |
// Add docObj to docs db | |
docObj.terms = _terms; | |
if (!this.documents) this.documents = {}; | |
this.documents[docObj.id] = docObj; | |
}; | |
BM25.updateIdf = function() { | |
let keys = Object.keys(this.terms); | |
for (let i = 0, len = keys.length; i < len; i++) { | |
let term = keys[i]; | |
let num = (this.totalDocuments - this.terms[term].n + 0.5); | |
let denom = (this.terms[term].n + 0.5); | |
this.terms[term].idf = Math.max(Math.log10(num / denom), 0.01); | |
} | |
}; | |
BM25.search = function(query) { | |
let queryTerms = BM25.tokenize(query); | |
let results = []; | |
// Look at each document in turn. There are better ways to do this with inverted indices. | |
let keys = Object.keys(this.documents); | |
for (let j = 0, nDocs = keys.length; j < nDocs; j++) { | |
let id = keys[j]; | |
// The relevance score for a document is the sum of a tf-idf-like | |
// calculation for each query term. | |
this.documents[id]._score = 0; | |
// Calculate the score for each query term | |
for (let i = 0, len = queryTerms.length; i < len; i++) { | |
let queryTerm = queryTerms[i]; | |
// We've never seen this term before so IDF will be 0. | |
// Means we can skip the whole term, it adds nothing to the score | |
// and isn't in any document. | |
if (typeof this.terms[queryTerm] === 'undefined') { | |
continue; | |
} | |
// This term isn't in the document, so the TF portion is 0 and this | |
// term contributes nothing to the search score. | |
if (typeof this.documents[id].terms[queryTerm] === 'undefined') { | |
continue; | |
} | |
// The term is in the document, let's go. | |
// The whole term is : | |
// IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength)) | |
// IDF is pre-calculated for the whole docset. | |
let idf = this.terms[queryTerm].idf; | |
// Numerator of the TF portion. | |
let num = this.documents[id].terms[queryTerm].count * (this.k1 + 1); | |
// Denomenator of the TF portion. | |
let denom = this.documents[id].terms[queryTerm].count + (this.k1 * (1 - this.b + (this.b * this.documents[id].termCount / this.averageDocumentLength))); | |
// Add this query term to the score | |
this.documents[id]._score += idf * num / denom; | |
} | |
if (!isNaN(this.documents[id]._score) && this.documents[id]._score > 0) { | |
results.push(this.documents[id]); | |
} | |
} | |
results.sort(function(a, b) { return b._score - a._score; }); | |
return results.slice(0, 10); | |
}; | |
// TODO: add example |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BM25.stopwods = ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herse', '', 'him', 'himse', '', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its', 'itse', '', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myse', '', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves']; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
⚠️ at line 134
If I'm correct it's not the
.count
but the.freq
to get.