-
-
Save hdf/5f0deb03e1b4fc52425ce3d9cd67cf1c to your computer and use it in GitHub Desktop.
fast longest common substring - by suffix array
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="utf-8"> | |
<title>Redundancy checker</title> | |
<meta name="viewport" content="width=device-width, initial-scale=1"> | |
<!--<script src="lrs.min.js"></script>--> | |
<style> | |
html { | |
font-family: 'Verdana', sans-serif; | |
margin: 0; | |
padding: 0; | |
} | |
pre { | |
margin: 0px; | |
} | |
#main { | |
background-color: #fff; | |
border: 1px solid #999; | |
border-spacing: 5px; | |
width: 100%; | |
} | |
#main>div { | |
display: flex; | |
} | |
#main>div>div { | |
background-color: #eee; | |
padding: 3px 6px 3px 6px; | |
margin: 1px; | |
flex: 0 0 110px; | |
} | |
#main>div>div:nth-child(2n) { | |
flex: 1 0 auto; | |
} | |
#main>div>div:nth-child(3n) { | |
flex: 0 1 145px; | |
} | |
.header { | |
font-size: 1.2em; | |
font-weight: bold; | |
} | |
center { | |
display: flex; | |
align-items: top; | |
justify-content: center; | |
} | |
#file { | |
display: none; | |
} | |
#btn { | |
margin-bottom: 16px; | |
padding: 0px 5px 0px 5px; | |
height: 72px; | |
flex: 1 1 0%; | |
max-width: 256px; | |
text-align: center; | |
font-size: 36px; | |
line-height: 72px; | |
background: rgb(42, 128, 225) none repeat scroll 0% 0%; | |
color: rgb(255, 255, 255); | |
border-radius: 12px; | |
box-shadow: inset 0 5px 5px rgba(0,0,0,0.05),0 0 5px rgba(0,0,0,0.8); | |
} | |
#btn:hover { | |
background: rgb(82, 64, 255) none repeat scroll 0% 0%; | |
} | |
#chk { | |
display: flex; | |
margin: 0px 0px 16px 5px; | |
align-items: center; | |
width: 400px; | |
padding: 0px 6px 0px 6px; | |
background-color: #efefef; | |
border-radius: 9px; | |
} | |
</style> | |
</head> | |
<body> | |
<center> | |
<input type="file" name="file" id="file"> | |
<label id="btn" for="file" tabindex="0">Browse</label> | |
| |
<div id="chk"> | |
<input type="checkbox" id="str" checked="checked"> | |
<label for="str" tabindex="1" style="text-align: left; padding-left: 5px;"> | |
Read file as string, rather than binary. This may result in incorrect "Occurs at:" numbers, but will increase search speed significantly. | |
</label> | |
</div> | |
</center> | |
<div id="main"> | |
</div> | |
<script> | |
if(function(){var r=Math.floor,t=function(r){return r};function n(r,n){var e,o,f=r.length,s=[],i=f,u=-1,c=0;for(n=n||t;i--;)u=Math.max(n(r[i]),u);for(o=(u>>24?32:u>>16&&24)||u>>8&&16||8;c<o;c+=4){for(i=16;i--;)s[i]=[];for(i=f;i--;)s[n(r[i])>>c&15].push(r[i]);for(e=0;e<16;e++)for(u=s[e].length;u--;)r[++i]=s[e][u]}return r}function e(r){return"function"==typeof r?r:function(r){return"[object String]"==Object.prototype.toString.call(r)}(r)?function(t){return r.charCodeAt(t)}:function(t){return r[t]}}this.suffixArray=function(t,o){var f;return o="number"==typeof(f=o)||f instanceof Number?o:t.length,function t(e,o,f){var s,i,u,c=[],a=[],l=r(2*o/3),g=o-l,d=l+1>>1,b=l,h=0,p=[],y=[];if(1==o)return[[0],[0]];u=function(r){return r>=o?0:e(r)};for(;b--;)c[b]=1+(3*b>>1);for(b=3;b--;)n(c,function(r){return u(b+r)});h=a[r(c[0]/3)+(c[0]%3==1?0:d)]=1;for(b=1;b<l;b++)u(c[b])==u(c[b-1])&&u(c[b]+1)==u(c[b-1]+1)&&u(c[b]+2)==u(c[b-1]+2)||h++,a[r(c[b]/3)+(c[b]%3==1?0:d)]=h;if(h<l)for(a=t(function(r){return a[r]},l,!0),b=l;b--;)c[b]=a[b]<d?3*a[b]+1:3*(a[b]-d)+2;for(b=l;b--;)p[c[b]]=b;p[o]=-1;p[o+1]=-2;i=function(r,t){return u(r)-u(t)||(r%3==2?u(r+1)-u(t+1)||p[r+2]-p[t+2]:p[r+1]-p[t+1])};a=o%3==1?[o-1]:[];for(b=0;b<l;b++)c[b]%3==1&&a.push(c[b]-1);n(a,function(r){return u(r)});if(f){for(b=0,h=0,s=0;b<l&&h<g;)y[s++]=i(c[b],a[h])<0?c[b++]:a[h++];for(;b<l;)y[s++]=c[b++];for(;h<g;)y[s++]=a[h++];return y}var v,S,x=[],j=[0],m=0;for(b=0,h=0,s=0;b<l&&h<g;)if(v=s,y[s++]=i(c[b],a[h])<0?c[b++]:a[h++],x[y[v]]=v,x[v]>0){for(S=y[x[v]-1];v!=o-m&&S!=o-m&&u(v+m)==u(S+m);)m++;j[x[v]]=m,m>0&&m--}for(;b<l;)if(v=s,y[s++]=c[b++],x[y[v]]=v,x[v]>0){for(S=y[x[v]-1];v!=o-m&&S!=o-m&&u(v+m)==u(S+m);)m++;j[x[v]]=m,m>0&&m--}for(;h<g;)if(v=s,y[s++]=a[h++],x[y[v]]=v,x[v]>0){for(S=y[x[v]-1];v!=o-m&&S!=o-m&&u(v+m)==u(S+m);)m++;j[x[v]]=m,m>0&&m--}return[y,j]}(e(t),o)},this.LRS=function(r,t,n){t=t||3,n=n||10,nots=r.constructor===Uint8Array,decoder=new TextDecoder;for(var e,o,f,[i,u]=suffixArray(r),c={},a=nots?decoder.decode(r):r,l=0;l<i.length;l++)if(!((e=u[l])<2||(substring=nots?r.subarray(i[l],i[l]+e):r.substring(i[l],i[l]+e),substring=nots?decoder.decode(substring):substring,(o=a.split(substring).length-1)<t))){for(s in f=!1,c)if(s.length>=e){if(s.includes(substring)){f=!0;break}}else substring.includes(s)&&delete c[s];f||(c[substring]=[o,i[l]])}return Object.entries(c).sort((r,t)=>t[1][0]*t[0].length**2-r[1][0]*r[0].length**2).slice(0,n).reduce((r,t)=>(r[t[0]]=c[t[0]],r),{})},this.suffixArray.bsort=n}.call(),"object"==typeof process){var fs=require("fs"),file=fs.readFileSync(process.argv.length>2?process.argv[2]:"suffixarray.js","utf8");console.log(LRS(file))}else"function"==typeof importScripts&&addEventListener("message",r=>postMessage(LRS(r.data))); | |
</script> | |
<script> | |
(function() { | |
var msgs = ['Browse', 'Browse again'], b; | |
function makeWorker(f) { | |
//var blob = new Blob(['(', f.toString(), ')();'], { type: 'text/javascript'}); | |
var blob = new Blob([f.toString()], {type: 'text/javascript'}); | |
var blobUrl = URL.createObjectURL(blob); | |
var worker = new Worker(blobUrl); | |
URL.revokeObjectURL(blobUrl); | |
return worker; | |
} | |
//var w = makeWorker(e => postMessage(LRS(e.data))); | |
var w = makeWorker(document.getElementsByTagName('script')[0].innerHTML.trim()); | |
w.addEventListener('message', function(e) { | |
//var res = LRS(e.target.result); | |
var res = e.data; | |
var m = document.getElementById('main'); | |
m.innerHTML = ' <div class="header"><div>Occurs at:</div><div>Substring:</div><div>Found count:</div></div>\n'; | |
for (r in res) | |
m.innerHTML += ' <div><div>' + res[r][1] + '</div><div><pre>' + r + '</pre></div><div>' + res[r][0] + '</div></div>\n'; | |
//document.getElementById('btn').textContent = msgs[Math.floor(Math.random() * msgs.length)]; | |
document.getElementById('btn').textContent = msgs[b=b?0:1]; | |
}); | |
function selectHandler(e) { | |
var reader = new FileReader(); | |
var d = new TextDecoder(); | |
reader.onload = function(e) { | |
document.getElementById('btn').textContent = 'Processing...'; | |
w.postMessage((typeof e.target.result==='string')?e.target.result:(new Uint8Array(e.target.result))); | |
}; | |
if (document.getElementById('str').checked) | |
reader.readAsText(e.target.files[0]); | |
else | |
reader.readAsArrayBuffer(e.target.files[0]); | |
} | |
document.getElementById('file').addEventListener('change', selectHandler, false); | |
})(); | |
</script> | |
</body> | |
</html> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# From: https://gist.github.com/hdf/5f0deb03e1b4fc52425ce3d9cd67cf1c | |
# Original from: https://gist.github.com/hynekcer/fa340f3b63826168ffc0c4b33310ae9c | |
"""Find the longest repeated substring. | |
"Efficient way to find longest duplicate string for Python (From Programming Pearls)" | |
http://stackoverflow.com/questions/13560037/ | |
The algorithm is based on "Prefix doubling". | |
The worst time complexity is O(n (log n)^2). Memory requirements are linear. | |
""" | |
import time | |
from random import randint | |
import itertools | |
import sys | |
import unittest | |
from itertools import groupby | |
from operator import itemgetter | |
import logging | |
log = logging.getLogger(__name__) | |
log.setLevel(logging.INFO) | |
try: | |
log.addHandler(logging.NullHandler()) | |
except AttributeError: | |
pass | |
cache = False | |
def run(): | |
if sys.argv[1:] == ['-']: | |
print("(Return/Enter and CTRL+D or CTRL+Z" + | |
" and Return/Enter again to finish.)") | |
text = sys.stdin.read() | |
elif sys.argv[1:]: | |
print('Reading data...') | |
text = open(sys.argv[1]).read() | |
else: | |
text = 'banana\nabababab' | |
print('Sorting...') | |
global cache | |
cache = False | |
result = longest_common_substring(text) | |
print('Longest common substrings in "{0}..." are:\n{1}'.format( | |
text[:20], result)) | |
result = LRS(text) | |
print('Top (at most) 10, atleast 3 times repeated longest' + | |
' non-overlapping substrings are:\n{0}'.format(result)) | |
def LRS(text, m=3, n=10): # Longest repeated non-overlapping substrings | |
""" | |
Returns longest (maximum n number of) non-overlapping (minimum m times) | |
repeated strings and how many times they are found in the text in | |
descending order of count*len(substring)^2. | |
""" | |
sa, rsa, lcp = suffix_array(text) | |
result = {} | |
for i in range(0, len(sa)): | |
l = lcp[i] | |
if l < 2: # Don't bother with individual characters. | |
continue | |
substring = text[sa[i]:sa[i] + l] | |
c = text.count(substring) # This is the bottleneck. | |
if c < m: # Not found enough times. | |
continue | |
halt = False | |
for s in list(result.keys()): # Filter out overlaps. | |
if len(s) >= l: | |
if substring in s: | |
halt = True | |
break | |
elif s in substring: | |
del result[s] | |
if not halt: | |
result[substring] = c | |
return dict(sorted(result.items(), | |
key=lambda x: x[1]*(len(x[0])**2), reverse=True)[:n]) | |
def longest_common_substring(text): | |
"""Get the longest common substrings and their positions. | |
>>> longest_common_substring('banana') | |
{'ana': [1, 3]} | |
>>> text = "not so Agamemnon, who spoke fiercely to " | |
>>> sorted(longest_common_substring(text).items()) | |
[(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])] | |
This function can be easy modified for any criteria, e.g. for searching ten | |
longest non overlapping repeated substrings. | |
""" | |
sa, rsa, lcp = suffix_array(text) | |
maxlen = max(lcp) | |
result = {} | |
for i in range(1, len(text)): | |
if lcp[i] == maxlen: | |
j1, j2, h = sa[i - 1], sa[i], lcp[i] | |
assert text[j1:j1 + h] == text[j2:j2 + h] | |
substring = text[j1:j1 + h] | |
if substring not in result: | |
result[substring] = [j1] | |
result[substring].append(j2) | |
return dict((k, sorted(v)) for k, v in result.items()) | |
def suffix_array(text, _step=16): | |
global cache | |
if cache and "unittest" not in sys.modules: | |
return cache | |
"""Analyze all common strings in the text. | |
Short substrings of the length _step a are first pre-sorted. The are the | |
results repeatedly merged so that the garanteed number of compared | |
characters bytes is doubled in every iteration until all substrings are | |
sorted exactly. | |
Arguments: | |
text: The text to be analyzed. | |
_step: Is only for optimization and testing. It is the optimal length | |
of substrings used for initial pre-sorting. The bigger value is | |
faster if there is enough memory. Memory requirements are | |
approximately (estimate for 32 bit Python 3.3): | |
len(text) * (29 + (_size + 20 if _size > 2 else 0)) + 1MB | |
Return value: (tuple) | |
(sa, rsa, lcp) | |
sa: Suffix array for i in range(1, size): | |
assert text[sa[i-1]:] < text[sa[i]:] | |
rsa: Reverse suffix array for i in range(size): | |
assert rsa[sa[i]] == i | |
lcp: Longest common prefix for i in range(1, size): | |
assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]] | |
if sa[i-1] + lcp[i] < len(text): | |
assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]] | |
>>> suffix_array(text='banana') | |
([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2]) | |
Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana' | |
The Longest Common String is 'ana': lcp[2] == 3 == len('ana') | |
It is between tx[sa[1]:] == 'ana' < 'anana' == tx[sa[2]:] | |
""" | |
tx = text | |
t0 = time.time() | |
size = len(tx) | |
step = min(max(_step, 1), len(tx)) | |
sa = list(range(len(tx))) | |
log.debug('%6.3f pre sort', time.time() - t0) | |
sa.sort(key=lambda i: tx[i:i + step]) | |
log.debug('%6.3f after sort', time.time() - t0) | |
grpstart = size * [False] + [True] # a boolean map for iteration speedup. | |
# It helps to skip yet resolved values. The last value True is a sentinel. | |
rsa = size * [None] | |
stgrp, igrp = '', 0 | |
for i, pos in enumerate(sa): | |
st = tx[pos:pos + step] | |
if st != stgrp: | |
grpstart[igrp] = (igrp < i - 1) | |
stgrp = st | |
igrp = i | |
rsa[pos] = igrp | |
sa[i] = pos | |
grpstart[igrp] = (igrp < size - 1 or size == 0) | |
log.debug('%6.3f after group', time.time() - t0) | |
while grpstart.index(True) < size: | |
# assert step <= size | |
nmerge = 0 | |
nextgr = grpstart.index(True) | |
while nextgr < size: | |
igrp = nextgr | |
nextgr = grpstart.index(True, igrp + 1) | |
glist = [] | |
for ig in range(igrp, nextgr): | |
pos = sa[ig] | |
if rsa[pos] != igrp: | |
break | |
newgr = rsa[pos + step] if pos + step < size else -1 | |
glist.append((newgr, pos)) | |
glist.sort() | |
for ig, g in groupby(glist, key=itemgetter(0)): | |
g = [x[1] for x in g] | |
sa[igrp:igrp + len(g)] = g | |
grpstart[igrp] = (len(g) > 1) | |
for pos in g: | |
rsa[pos] = igrp | |
igrp += len(g) | |
nmerge += len(glist) | |
log.debug('%6.3f for step=%d nmerge=%d', time.time() - t0, step, nmerge) | |
step *= 2 | |
del grpstart | |
# create LCP array | |
lcp = size * [0] | |
h = 0 | |
for i in range(size): | |
if rsa[i] > 0: | |
j = sa[rsa[i] - 1] | |
while i != size - h and j != size - h and tx[i + h] == tx[j + h]: | |
h += 1 | |
lcp[rsa[i]] = h | |
if h > 0: | |
h -= 1 | |
log.debug('%6.3f end', time.time() - t0) | |
cache = (sa, rsa, lcp) | |
return cache | |
# --- | |
class TestMixin(object): | |
def suffix_verify(self, text, step=16): | |
tx = text | |
sa, rsa, lcp = suffix_array(text=tx, _step=step) | |
self.assertEqual(set(sa), set(range(len(tx)))) | |
ok = True | |
for i0, i1, h in zip(sa[:-1], sa[1:], lcp[1:]): | |
self.assertEqual(tx[i1:i1 + h], tx[i0:i0 + h], "Verify LCP characters equal on text '%s...'" % text[:20]) | |
self.assertGreater(tx[i1 + h:i1 + h + 1], tx[i0 + h:i0 + h + 1], | |
"Verify LCP+1 char is different '%s...'" % text[:20]) | |
self.assertLessEqual(max(i0, i1), len(tx) - h, | |
"Verify LCP is not more than length of string '%s...'" % text[:20]) | |
self.assertTrue(ok) | |
class SuffixArrayTest(unittest.TestCase, TestMixin): | |
def test_16(self): | |
# 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana' | |
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2]) | |
self.assertEqual(suffix_array(text='banana', _step=16), expect) | |
def test_1(self): | |
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2]) | |
self.assertEqual(suffix_array(text='banana', _step=1), expect) | |
def test_mini(self): | |
self.assertEqual(suffix_array(text='', _step=1), ([], [], [])) | |
self.assertEqual(suffix_array(text='a', _step=1), ([0], [0], [0])) | |
self.assertEqual(suffix_array(text='aa', _step=1), ([1, 0], [1, 0], [0, 1])) | |
self.assertEqual(suffix_array(text='aaa', _step=1), ([2, 1, 0], [2, 1, 0], [0, 1, 2])) | |
def test_example(self): | |
self.suffix_verify('abracadabra') | |
def test_cartesian(self): | |
"""Test all combinations of alphabet "ABC" up to length 4 characters""" | |
for size in range(7): | |
for cartesian in itertools.product(*(size * ['ABC'])): | |
text = ''.join(cartesian) | |
log.debug('Testing "%s"', text) | |
self.suffix_verify(text, 1) | |
def test_lcp(self): | |
expect = {'ana': [1, 3]} | |
self.assertDictEqual(longest_common_substring('banana'), expect) | |
expect = {' s': [3, 21], 'no': [0, 13], 'o ': [5, 20, 38]} | |
self.assertDictEqual(longest_common_substring( | |
"not so Agamemnon, who spoke fiercely to "), expect) | |
class SlowTests(unittest.TestCase, TestMixin): | |
"""Slow development tests running many minutes. | |
It can be run only by an EXPLICIT command! | |
e.g.: python -m unittest maxsubstring.SlowTests._test_random | |
""" | |
def _test_random(self): | |
for power in range(2, 21, 2): | |
size = randint(2 ** (power - 1), 2 ** power) | |
for alphabet in (2, 4, 16, 256): | |
text = ''.join(chr(65 + randint(0, alphabet - 1)) for _ in range(size)) | |
log.debug('%s %s %s', size, alphabet, 1) | |
self.suffix_verify(text, 1) | |
log.debug('%s %s %s', size, alphabet, 16) | |
self.suffix_verify(text, 16) | |
if __name__ == '__main__': | |
run() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// From: https://gist.github.com/hdf/5f0deb03e1b4fc52425ce3d9cd67cf1c | |
// (Minify with: https://javascript-minifier.com/) | |
// Original from: https://github.com/tixxit/suffixarray/blob/master/suffixarray.js | |
/** | |
* An implementation of the linear time suffix array construction of | |
* Karkkainen & Sanders: | |
* | |
* "Simple Linear Work Suffix Array Construction", Karkainen and Sanders. | |
* | |
* Creating a suffix array is very simple; just call suffixArray(...) with | |
* either a string or a function that returns integers and its length. For | |
* example, | |
* | |
* var s = "Sort this!"; | |
* suffixArray(s); // Returns [4, 9, 0, 6, 7, 1, 2, 8, 3, 5]. | |
* | |
* function reverse(i) { return s.charCodeAt(s.length - 1 - i) } | |
* suffixArray(reverse, s.length); // Returns [5, 0, 9, 3, 2, 8, 7, 1, 4, 6]. | |
* | |
* @author Thomas Switzer | |
*/ | |
(function() { | |
var global = this, | |
floor = Math.floor, | |
identity = function(x) { return x }, | |
undefined; | |
/** | |
* Sorts an array of (unsigned) integers in linear time. The values of the | |
* array (a) act as keys, which are passed to the key function which returns an | |
* integer value. | |
* | |
* @param a An array of keys to sort. | |
* @param key A function that maps keys to integer values. | |
* @return The array a. | |
*/ | |
function bsort(a, key) { | |
var len = a.length, | |
buckets = [], | |
i = len, j = -1, b, d = 0, | |
keys = 0, | |
bits; | |
key = key || identity; | |
while (i--) | |
j = Math.max(key(a[i]), j); | |
bits = j >> 24 && 32 || j >> 16 && 24 || j >> 8 && 16 || 8; | |
for (; d < bits; d += 4) { | |
for (i = 16; i--;) | |
buckets[i] = []; | |
for (i = len; i--;) | |
buckets[(key(a[i]) >> d) & 15].push(a[i]); | |
for (b = 0; b < 16; b++) | |
for (j = buckets[b].length; j--;) | |
a[++i] = buckets[b][j]; | |
} | |
return a; | |
} | |
function isInt(n) { | |
return typeof n == "number" || n instanceof Number; | |
} | |
function isStr(s) { | |
return Object.prototype.toString.call(s) == "[object String]"; | |
} | |
function wrap(s) { | |
return typeof s == "function" ? s : (isStr(s) | |
? function(i) { return s.charCodeAt(i) } | |
: function(i) { return s[i] }); | |
} | |
/** | |
* Returns the suffix array of the string s. The suffix array is constructed | |
* in linear time. Also returns Longest common prefix array. | |
* | |
* The string s can either be an Unicode string (ie. JavaScript String object) | |
* or a function that takes an index (integer >= 0) and returns another | |
* integer (a "symbol"). If a function is provided, then another argument | |
* specifying its length (integer >= 0) must be provided. | |
* | |
* This also takes a 3rd optional parameter that dictates how to treat the end | |
* of the string. This can be either "min" or "wrap". If it is "min", then | |
* characters after the end of the string are treated as 0's (the minimum). | |
* If "wrap" is given, then the end of the string wraps back around to the | |
* beginning. If this parameter is omitted, then "wrap" is assumed. | |
* | |
* In the case of strings, you can omit the 2nd paramter (length) and still | |
* provide the 3rd paramter. For instance, suffixArray(str, "min"). | |
* | |
* The returned array contains the indexes of the string in the lexicographical | |
* order of the suffixes that start at those indexes. | |
* | |
* @param s A string or function that maps ints between [0, len) to integers. | |
* @param len The length of s (optional if s is a string, required otherwise). | |
* @param end Either "min", "wrap" or leave out (defaults to "wrap"). | |
* @return An array of indexes into s. | |
*/ | |
global.suffixArray = function(s, len, end) { | |
end = end || len; | |
len = isInt(len) ? len : s.length; | |
if (end == "wrap") | |
return wrappedSuffixArray(s, len); | |
else | |
return _suffixArray(wrap(s), len); | |
} | |
/** | |
* Longest repeated non-overlapping substrings: | |
* Returns longest (maximum n number of) non-overlapping (minimum m times) | |
* repeated strings and how many times they are found in the text in | |
* descending order of count*len(substring)^2. | |
*/ | |
global.LRS = function(text, m, n) { | |
m = m || 3; | |
n = n || 10; | |
nots = (text.constructor===Uint8Array); | |
decoder = new TextDecoder(); | |
var [sa, lcp] = suffixArray(text); | |
var result = {}, l, c, halt, | |
t = nots?decoder.decode(text):text; | |
for (var i = 0; i < sa.length; i++) { | |
l = lcp[i]; | |
if (l < 2) continue; // Don't bother with individual characters. | |
substring = nots?text.subarray(sa[i], sa[i] + l):text.substring(sa[i], sa[i] + l); // Subarray is very slow. :( | |
substring = nots?decoder.decode(substring):substring; | |
c = t.split(substring).length - 1; // This is slow too. | |
if (c < m) continue; // Not found enough times. | |
halt = false; | |
for (s in result) // Filter out overlaps. | |
if (s.length >= l) { | |
if (s.includes(substring)) { | |
halt = true; | |
break; | |
} | |
} else if (substring.includes(s)) | |
delete result[s]; | |
if (!halt) result[substring] = [c, sa[i]]; | |
} | |
return Object.entries(result).sort((a, b) => | |
(b[1][0]*(b[0].length**2))-(a[1][0]*(a[0].length**2))) | |
.slice(0, n).reduce((o, k) => { | |
o[k[0]] = result[k[0]]; | |
return o; | |
}, {}); | |
} | |
// Export the Bucket Sort. | |
global.suffixArray.bsort = bsort; | |
/** | |
* Constructs the suffix array of s. It takes either a string, an array, or a | |
* function that takes an integer and returns a unsigned integer. It also takes | |
* an optional 2nd paramter, the length. This is required if the first | |
* parameter is a function. | |
* | |
* This uses the nice idea from Karkkainen & Sander's paper of replacing each | |
* letter with the equivalent k-letter version (3 in their paper, 2 in this | |
* algorithm). This is repeated recursively until all the letters are | |
* different. This doesn't have the nice 1/3 pruning / merge step of their | |
* algorithm, but still performs relatively fast, running in O(n log n). | |
* | |
* @param s A string, array, or function. | |
* @param len The length of s. | |
* @return The order of the suffixes. | |
*/ | |
function wrappedSuffixArray(s, len) { | |
len = isInt(len) ? len : s.length; | |
s = wrap(s); | |
var array = [], | |
swap = [], | |
order = [], | |
span, | |
sym, | |
i = len; | |
while (i--) | |
array[i] = s(order[i] = i); | |
for (span = 1; sym != len && span < len; span *= 2) { | |
bsort(order, function(i) { return array[(i + span) % len] }); | |
bsort(order, function(i) { return array[i] }); | |
sym = swap[order[0]] = 1; | |
for (i = 1; i < len; i++) { | |
if (array[order[i]] != array[order[i - 1]] || array[(order[i] + span) % len] != array[(order[i - 1] + span) % len]) | |
sym++; | |
swap[order[i]] = sym; | |
} | |
tmp = array; | |
array = swap; | |
swap = tmp; | |
} | |
return order; | |
} | |
/* Constructs the suffix array of s. In this case, s must be a function that | |
* maps integers between 0 and len - 1 to "symbols" (unsigned integers). It | |
* returns the suffixes in lexicographical order as an array of indexes where | |
* those suffixes start. Also returns Longest common prefix array. | |
* | |
* I have tried to keep the code reasonably well commented. Both for my sake, | |
* and yours. That said, my code was not written with pedagogy in mind, but | |
* to be relatively fast and have a small minified size. | |
* | |
* The description of the algorithm in the paper is very concise and is well | |
* worth a read. | |
* | |
* The C code accompanying the paper is very terse and, IMHO, creates more | |
* confusion than clarity. While the algorithm itself is fairly simple (simple | |
* and fast, who wants more?), it does deal with quite a bit of abstraction. | |
* That is, you are dealing with a lot of placeholders, rather than concrete | |
* objects; indexes into the string to represent suffixes, lexical names | |
* representing triplets of symbols, indexes of these lexical names, etc. | |
*/ | |
function _suffixArray(_s, len, sub) { | |
var a = [], | |
b = [], | |
alen = floor(2 * len / 3), // Number of indexes s.t. i % 3 != 0. | |
blen = len - alen, // Number of indexes s.t. i % 3 = 0. | |
r = (alen + 1) >> 1, // Number of indexes s.t. i % 3 = 1. | |
i = alen, | |
j = 0, | |
k, | |
lookup = [], | |
result = [], | |
tmp, cmp, | |
s; | |
if (len == 1) | |
return [[0], [0]]; | |
s = function(i) { return i >= len ? 0 : _s(i) }; | |
// Sort suffixes w/ indices % 3 != 0 by their first 3 symbols (triplets). | |
while (i--) | |
a[i] = ((i * 3) >> 1) + 1; // a = [1, 2, 4, 5, 7, 8, 10, 11, 13, ...] | |
for (i = 3; i--;) | |
bsort(a, function(j) { return s(i + j) }); | |
// Assign lexicographical names (j) to the triplets of consecutive symbols, | |
// s.t. the order of the lex. names match the lex. order of the triplets. | |
// Array b contains lex. names in the order they appear in s for i % 3 != 0 | |
j = b[floor(a[0] / 3) + (a[0] % 3 == 1 ? 0 : r)] = 1; | |
for (i = 1; i < alen; i++) { | |
if (s(a[i]) != s(a[i-1]) || s(a[i] + 1) != s(a[i-1] + 1) || s(a[i] + 2) != s(a[i-1] + 2)) | |
j++; | |
b[floor(a[i] / 3) + (a[i] % 3 == 1 ? 0 : r)] = j; | |
} | |
// If all lex. names are unique, then a is already completely sorted. | |
if (j < alen) { | |
// Otherwise, recursively sort lex. names in b, then reconstruct the | |
// indexes of the sorted array b so they are relative to a. | |
b = _suffixArray(function(i) { return b[i] }, alen, true); | |
for (i = alen; i--;) | |
a[i] = b[i] < r ? b[i] * 3 + 1 : ((b[i] - r) * 3 + 2); | |
} | |
// Create a reverse lookup table for the indexes i, s.t. i % 3 != 0. | |
// This table can be used to simply determine the sorted order of 2 | |
// suffixes whose indexes are both not divisible by 3. | |
for (i = alen; i--;) | |
lookup[a[i]] = i; | |
lookup[len] = -1; | |
lookup[len + 1] = -2; | |
/** | |
* This is a comparison function for the suffixes at indices m & n that | |
* uses the lookup table to shorten the searches. It assumes that | |
* n % 3 == 0 and m % 3 != 0. | |
*/ | |
cmp = function(m, n) { | |
return (s(m) - s(n)) || (m % 3 == 2 | |
? (s(m + 1) - s(n + 1)) || (lookup[m + 2] - lookup[n + 2]) | |
: (lookup[m + 1] - lookup[n + 1])) | |
}; | |
// Sort remaining suffixes (i % 3 == 0) using prev result (i % 3 != 0). | |
b = len % 3 == 1 ? [ len - 1 ] : []; | |
for (i = 0; i < alen; i++) | |
if (a[i] % 3 == 1) | |
b.push(a[i] - 1); | |
bsort(b, function(j) { return s(j) }); | |
// Merge a (i % 3 != 0) and b (i % 3 == 0) together. We only need to | |
// compare, at most, 2 symbols before we end up comparing 2 suffixes whose | |
// indices are both not divisible by 3. At this point, we can use the | |
// reverse lookup array to order them. | |
if (sub) { // No need for extra rsa and lcp processing in sub calls, only in the top level one. | |
for (i = 0, j = 0, k = 0; i < alen && j < blen;) | |
result[k++] = cmp(a[i], b[j]) < 0 ? a[i++] : b[j++]; | |
while (i < alen) | |
result[k++] = a[i++]; | |
while (j < blen) | |
result[k++] = b[j++]; | |
return result; | |
} | |
var rsa = [], lcp = [0], h = 0, i2, j2; | |
for (i = 0, j = 0, k = 0; i < alen && j < blen;) { | |
i2 = k; | |
result[k++] = cmp(a[i], b[j]) < 0 ? a[i++] : b[j++]; | |
rsa[result[i2]] = i2; | |
if (rsa[i2] > 0) { | |
j2 = result[rsa[i2] - 1]; | |
while (i2 != len - h && j2 != len - h && s(i2 + h) == s(j2 + h)) h++; | |
lcp[rsa[i2]] = h; | |
if (h > 0) h--; | |
} | |
} | |
while (i < alen) { | |
i2 = k; | |
result[k++] = a[i++]; | |
rsa[result[i2]] = i2; | |
if (rsa[i2] > 0) { | |
j2 = result[rsa[i2] - 1]; | |
while (i2 != len - h && j2 != len - h && s(i2 + h) == s(j2 + h)) h++; | |
lcp[rsa[i2]] = h; | |
if (h > 0) h--; | |
} | |
} | |
while (j < blen) { | |
i2 = k; | |
result[k++] = b[j++]; | |
rsa[result[i2]] = i2; | |
if (rsa[i2] > 0) { | |
j2 = result[rsa[i2] - 1]; | |
while (i2 != len - h && j2 != len - h && s(i2 + h) == s(j2 + h)) h++; | |
lcp[rsa[i2]] = h; | |
if (h > 0) h--; | |
} | |
} | |
return [result, lcp]; | |
} | |
}).call(); | |
if (typeof process === 'object') { | |
var fs = require('fs'); | |
var file = fs.readFileSync((process.argv.length > 2) ? process.argv[2] : 'suffixarray.js', 'utf8'); | |
console.log(LRS(file)); | |
} else { | |
if (typeof importScripts === 'function') addEventListener('message', e => postMessage(LRS(e.data))); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment