Created
April 9, 2024 05:40
-
-
Save jjrv/ac616cea4c039b8c94b4fdc354e063bc to your computer and use it in GitHub Desktop.
Histogram diff in TypeScript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { LCS, Interned } from './lcs'; | |
export class Histogram<Token> { | |
constructor(before: Token[], after: Token[]) { | |
let prefix = 0; | |
let beforeEnd = before.length; | |
let afterEnd = after.length; | |
// Skip common prefix. | |
while(prefix < beforeEnd && prefix < afterEnd && before[prefix] == after[prefix]) ++prefix; | |
// Skip common suffix. | |
while(beforeEnd > prefix && afterEnd > prefix) { | |
if(before[--beforeEnd] != after[--afterEnd]) { | |
++beforeEnd; | |
++afterEnd; | |
break; | |
} | |
} | |
const beforeCount = beforeEnd - prefix; | |
const afterCount = afterEnd - prefix; | |
this.before = new Uint32Array(beforeCount); | |
this.after = new Uint32Array(afterCount); | |
// Intern tokens. | |
const tokenMap = this.tokenMap; | |
let tokens = 1; | |
for(let pos = 0; pos < beforeCount; ++pos) { | |
const token = before[pos + prefix]; | |
this.before[pos] = tokenMap.get(token) || (tokenMap.set(token, tokens), tokens++); | |
} | |
for(let pos = 0; pos < afterCount; ++pos) { | |
const token = after[pos + prefix]; | |
this.after[pos] = tokenMap.get(token) || (tokenMap.set(token, tokens), tokens++); | |
} | |
// Allocate histogram buckets. | |
this.tokenBuckets = new Uint32Array(tokens); | |
this.tokenCounts = new Uint32Array(tokens + 2); | |
this.offsets = { data: new Uint32Array(beforeCount), pos: 0, end: 0 }; | |
const lcs = new LCS(this); | |
lcs.beforePos = 0; | |
lcs.beforeEnd = beforeCount; | |
lcs.afterPos = 0; | |
lcs.afterEnd = afterCount; | |
this.prefix = prefix; | |
this.stack = [lcs]; | |
} | |
countOffsets(token: Interned) { | |
const tokenCounts = this.tokenCounts; | |
const bucket = this.tokenBuckets[token]; | |
return bucket && tokenCounts[bucket] - tokenCounts[bucket - 1]; | |
} | |
getOffsets(token: Interned) { | |
const tokenCounts = this.tokenCounts; | |
const bucket = this.tokenBuckets[token]; | |
const offsets = this.offsets; | |
offsets.pos = tokenCounts[bucket - 1]; | |
offsets.end = tokenCounts[bucket]; | |
return offsets; | |
} | |
populate(lcs: LCS) { | |
const tokens = this.before; | |
const tokenBuckets = this.tokenBuckets; | |
const tokenCounts = this.tokenCounts; | |
const tokenOffsets = this.offsets.data; | |
const prevEnd = this.end; | |
// Clear old bucket assignments. | |
for(let pos = this.start; pos < prevEnd; ++pos) { | |
const token = tokens[pos]; | |
tokenBuckets[token] = 0; | |
} | |
const start = lcs.beforePos; | |
const end = lcs.beforeEnd; | |
this.start = start; | |
this.end = end; | |
let buckets = 1; | |
tokenCounts[0] = 0; | |
tokenCounts[1] = 0; | |
// Assign buckets for unique tokens. | |
for(let pos = start; pos < end; ++pos) { | |
const token = tokens[pos]; | |
if(!tokenBuckets[token]) { | |
tokenBuckets[token] = buckets++; | |
tokenCounts[buckets] = 0; | |
} | |
} | |
// Count bucket sizes. | |
for(let pos = start; pos < end; ++pos) { | |
++tokenCounts[tokenBuckets[tokens[pos]] + 1]; | |
} | |
// Turn bucket sizes into cumulative sums. | |
let count = 0; | |
for(let bucket = 2; bucket < buckets; ++bucket) { | |
tokenCounts[bucket] = count += tokenCounts[bucket]; | |
} | |
// Store input offset of each bucket member. | |
for(let pos = start; pos < end; ++pos) { | |
tokenOffsets[tokenCounts[tokenBuckets[tokens[pos]]]++] = pos; | |
} | |
} | |
step(): LCS | null { | |
let stackPos = this.stackPos; | |
if(stackPos < 0) return null; | |
const stack = this.stack; | |
const lcs = stack[stackPos]!; | |
stack[stackPos] = this.lcs; | |
stackPos--; | |
while(lcs.beforePos != lcs.beforeEnd && lcs.afterPos != lcs.afterEnd) { | |
this.populate(lcs); | |
if(!lcs.find()) { | |
throw new Error('Should fall back to Myers'); | |
} | |
if(lcs.len === 0) break; | |
++stackPos; | |
const next = stack[stackPos] || (stack[stackPos] = new LCS(this)); | |
next.beforePos = lcs.beforeNext + lcs.len; | |
next.beforeEnd = lcs.beforeEnd; | |
next.afterPos = lcs.afterNext + lcs.len; | |
next.afterEnd = lcs.afterEnd; | |
lcs.beforeEnd = lcs.beforeNext; | |
lcs.afterEnd = lcs.afterNext; | |
} | |
this.lcs = lcs; | |
this.stackPos = stackPos; | |
const prefix = this.prefix; | |
lcs.beforePos += prefix; | |
lcs.beforeEnd += prefix; | |
lcs.afterPos += prefix; | |
lcs.afterEnd += prefix; | |
return lcs; | |
} | |
private lcs?: LCS; | |
private stack: (LCS | undefined)[]; | |
private stackPos = 0; | |
private tokenMap = new Map<Token, Interned>(); | |
private tokenBuckets: Uint32Array; | |
private tokenCounts: Uint32Array; | |
private start = 0; | |
private end = 0; | |
before: Uint32Array; | |
after: Uint32Array; | |
prefix = 0; | |
offsets: { data: Uint32Array, pos: number, end: number }; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import type { Histogram } from './Histogram'; | |
export const MAX_CHAIN_LEN = 63; | |
/** Interned tokens are numbers to use as array indices. */ | |
export type Interned = number; | |
/** Longest common subsequence */ | |
export class LCS { | |
constructor(private histogram: Histogram<unknown>) { | |
this.reset(); | |
} | |
find(): boolean { | |
const histogram = this.histogram; | |
const after = histogram.after; | |
const afterEnd = this.afterEnd; | |
let found = false; | |
this.reset(); | |
this.beforeNext = this.beforePos; | |
this.afterNext = this.afterPos; | |
for(let afterOffset = this.afterPos; afterOffset < afterEnd; ++afterOffset) { | |
const token = after[afterOffset]; | |
const count = histogram.countOffsets(token); | |
if(count) { | |
found = true; | |
if(count <= this.minCount) afterOffset = this.update(afterOffset, token) - 1; | |
} | |
} | |
return !found || this.minCount <= MAX_CHAIN_LEN; | |
} | |
reset() { | |
this.len = 0; | |
this.minCount = MAX_CHAIN_LEN + 1; | |
} | |
update(afterOffset: number, token: Interned): number { | |
const { beforePos, beforeEnd, afterPos, afterEnd } = this; | |
const histogram = this.histogram; | |
const before = histogram.before; | |
const after = histogram.after; | |
let { data: offsets, pos: offsetPos, end: offsetEnd } = histogram.getOffsets(token); | |
const offsetCount = offsetEnd - offsetPos; | |
let beforeOffset = offsets[offsetPos++]; | |
let posNext = afterOffset + 1; | |
while(offsetPos <= offsetEnd) { | |
let count = offsetCount; | |
let beforeNext = beforeOffset; | |
let afterNext = afterOffset; | |
while(beforeNext > beforePos && afterNext > afterPos && before[beforeNext - 1] == after[afterNext - 1]) { | |
--beforeNext; | |
--afterNext; | |
count = Math.min(count, histogram.countOffsets(before[beforeNext])); | |
} | |
let beforeEndNext = beforeOffset + 1; | |
let afterEndNext = afterOffset + 1; | |
while(beforeEndNext < beforeEnd && afterEndNext < afterEnd && before[beforeEndNext] == after[afterEndNext]) { | |
count = Math.min(count, histogram.countOffsets(before[beforeEndNext])); | |
++beforeEndNext; | |
++afterEndNext; | |
} | |
if(posNext < afterEndNext) posNext = afterEndNext; | |
const len = afterEndNext - afterNext; // Equals beforeEndNext - beforeNext | |
if(this.len < len || this.minCount > count) { | |
this.minCount = count; | |
this.beforeNext = beforeNext; | |
this.afterNext = afterNext; | |
this.len = len; | |
} | |
let offset: number; | |
while((offset = offsets[offsetPos++])) { | |
if(offset > afterEndNext) { | |
beforeOffset = offset; | |
break; | |
} | |
} | |
} | |
return posNext; | |
} | |
private minCount: number; | |
beforePos = 0; | |
beforeEnd = 0; | |
afterPos = 0; | |
afterEnd = 0; | |
beforeNext = 0; | |
afterNext = 0; | |
len: number; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is based on imara-diff:
https://github.com/pascalkuthe/imara-diff
Any mistakes in the TypeScript version are mine, not very much tested yet.