Created
March 27, 2023 03:50
-
-
Save shhyou/484b4202bcfcec70b85e7227522bb111 to your computer and use it in GitHub Desktop.
An O(n lg^2 n) implementation of the Burrows-Wheeler transform using the doubling algorithm and an O(n|S|) implementation of the inverse transform using modified radix sort.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket/base | |
(require racket/list | |
racket/vector | |
racket/bytes | |
racket/match) | |
(provide | |
;; plain implementation of the Burrows-Wheeler transformation | |
(struct-out bwencode) | |
plain-bwt-encode | |
plain-bwt-decode | |
;; faster implementation | |
bwt-encode | |
bwt-decode | |
) | |
(module+ test | |
(require rackunit)) | |
(struct bwencode (sorted-index bytes) #:transparent) | |
(module+ test | |
#| | |
banana 0..5 | |
ananab 1..5 0..0 | |
nanaba 2..5 0..1 | |
anaban 2..5 0..2 | |
nabana 2..5 0..3 | |
abanan 5..5 0..4 | |
=sort=> | |
0 abana|n | |
1 anaba|n | |
2 anana|b | |
* 3 banan|a | |
4 naban|a | |
5 nanab|a | |
|# | |
(check-equal? | |
(plain-bwt-encode #"banana") | |
(bwencode 3 #"nnbaaa")) | |
;; https://en.wikipedia.org/wiki/Burrows-Wheeler_transform | |
(check-equal? | |
(plain-bwt-encode #"^BANANA|") | |
(bwencode 6 #"BNN^AA|A")) | |
(check-equal? | |
(plain-bwt-encode #"R") | |
(bwencode 0 #"R")) | |
(check-equal? | |
(plain-bwt-encode #"NU") | |
(bwencode 0 #"UN")) | |
(check-equal? | |
(plain-bwt-encode #"UN") | |
(bwencode 1 #"UN")) | |
(define concolic-abs | |
(bytes-append | |
#"Higher-order functions have become a staple of modern programming languages. " | |
#"However, such values stymie concolic testers, as the SMT solvers at their hearts " | |
#"are inherently first-order. This paper lays a formal foundations for concolic " | |
#"testing higher-order functional programs. Three ideas enable our results: " | |
#"(i) our tester considers only program inputs in a canonical form; (ii) it " | |
#"collects novel constraints from the evaluation of the canonical inputs to " | |
#"search the space of inputs with partial help from an SMT solver and " | |
#"(iii) it collects constraints from canonical inputs even when they are " | |
#"arguments to concretized calls. We prove that (i) concolic evaluation is sound " | |
#"with respect to concrete evaluation; (ii) modulo concretization and SMT solver " | |
#"incompleteness, the search for a counter-example succeeds if a user program has " | |
#"a bug and (iii) this search amounts to directed evolution of inputs targeting " | |
#"hard-to-reach corners of the program.")) | |
(define concolic-rot | |
(bytes-append | |
#"t:;;dd.nde...snrsefhmrgnyse,seadamett)erooorslhaosceesdyhslalpssrrgmsrlgess" | |
#"relmflfn))grf)sneensse)shryenlehrseoTTTsease,srccesfmf,htn)tsssahnsd i" | |
#"iiiiisrsrrrtodsrssmsmn. SSS MMM npeurrcmiccncvvvvrrrr xr lcccp" | |
#"teeeh t pehe h zuuudhl a iiiiii uacrarrunnn en nnneenneennne" | |
#"enrnirrroi eohmrvhihtercllhWhhlvrssshdbprllzterchvhhv trmvtddvpsvthhdhdvndt" | |
#"ugrntttrrlgrr w -hiooooo unnnariiooooonrcccttcc t tttttt t wg" | |
#"gnt TtT(iii(itlllnnn sm hHi((i(( ttm aattttttttedfhh wwtteaaaaaa p" | |
#"bpllpoooooaueluaaaaooootnooaaoarrouyma aaoaaiooooreeoeooooiooouuaaauureiii" | |
#"aiooooaaa iiiiioooooueiieutlttttmm rrrrrcccccvsssrrrcciiiiiiccccccnnn i" | |
#"icccff---ffcsfcm rnHls saamm nnnnnoeoeeeieeeuueeeeettgggggaaaaoooaa-" | |
#"ihe cccaaooeopppppfffpeeeeiaaaytrtatnttndttrriiietattttrsemlt un eer" | |
#" eeenn eaiiacss ecesns ii resaauaacaceen -ssrcunnucunnuulsg" | |
#"lllsslbdsgffoooooo lpppppeee aooelleleo eelltaii")) | |
(check-equal? | |
(plain-bwt-encode concolic-abs) | |
(bwencode 172 concolic-rot)) | |
(define letters-rep | |
(bytes-append* (make-list 1000 #"abcdefghijklmnopqrstuvwxyz"))) | |
(define letters-sorted | |
(bytes-append* | |
(for/list ([idx (in-range 26)]) | |
(make-bytes 1000 (+ 97 (modulo (+ idx 25) 26)))))) | |
(check-equal? | |
(time (displayln '(plain-bwt-encode letters-rep)) | |
(plain-bwt-encode letters-rep)) | |
(bwencode 0 letters-sorted))) | |
(define (plain-bwt-encode bs) | |
(define len (bytes-length bs)) | |
(define rotated-bss | |
(for/list ([i (in-range len)]) | |
(cons i (bytes-append (subbytes bs i) (subbytes bs 0 i))))) | |
(define sorted-bss | |
(sort rotated-bss bytes<? #:key cdr)) | |
(define bs-last-col (make-bytes len)) | |
(for ([i (in-range len)] | |
[idx-bs (in-list sorted-bss)]) | |
(bytes-set! bs-last-col i (bytes-ref (cdr idx-bs) (sub1 len)))) | |
(bwencode (index-where sorted-bss (λ (idx-bs) (zero? (car idx-bs)))) | |
(bytes->immutable-bytes bs-last-col))) | |
(module+ test | |
(check-equal? | |
(bwt-encode/substr-exp-sort #"banana") | |
(bwencode 3 #"nnbaaa")) | |
(check-equal? | |
(bwt-encode/substr-exp-sort #"^BANANA|") | |
(bwencode 6 #"BNN^AA|A")) | |
(check-equal? | |
(bwt-encode/substr-exp-sort #"R") | |
(bwencode 0 #"R")) | |
(check-equal? | |
(bwt-encode/substr-exp-sort #"NU") | |
(bwencode 0 #"UN")) | |
(check-equal? | |
(bwt-encode/substr-exp-sort #"UN") | |
(bwencode 1 #"UN")) | |
(check-equal? | |
(bwt-encode/substr-exp-sort concolic-abs) | |
(bwencode 172 concolic-rot)) | |
(check-equal? | |
(time (displayln '(bwt-encode/substr-exp-sort letters-rep)) | |
(bwt-encode/substr-exp-sort letters-rep)) | |
(bwencode 0 letters-sorted))) | |
(struct rotslice (start len [rank #:mutable]) #:transparent) | |
(define (bwt-encode/substr-exp-sort bs) | |
(define len (bytes-length bs)) | |
(define (build-from-half-subsubstrs sublen/2 half-substrs-tbl slice) | |
(values (vector-ref half-substrs-tbl (rotslice-start slice)) | |
(vector-ref half-substrs-tbl (modulo (+ (rotslice-start slice) sublen/2) len)))) | |
(define (cmp-1byte sl1 sl2) | |
(< (bytes-ref bs (rotslice-start sl1)) (bytes-ref bs (rotslice-start sl2)))) | |
(define substrs1 | |
(build-vector len (λ (i) (rotslice i 1 #f)))) | |
(fill-slice-ranks! (sort (vector->list substrs1) cmp-1byte) cmp-1byte) | |
(define substrs-table | |
(cond | |
[(<= (bytes-length bs) 1) | |
(vector substrs1)] | |
[else | |
(list->vector | |
(cons substrs1 | |
(let loop ([sublen/2 1] [substrs/2 substrs1]) | |
(define new-substrs | |
(build-vector len (λ (i) (rotslice i (+ sublen/2 sublen/2) #f)))) | |
(define (cmp-sublen sl1 sl2) | |
(define-values (subsl1-1 subsl1-2) | |
(build-from-half-subsubstrs sublen/2 substrs/2 sl1)) | |
(define-values (subsl2-1 subsl2-2) | |
(build-from-half-subsubstrs sublen/2 substrs/2 sl2)) | |
(if (not (= (rotslice-rank subsl1-1) (rotslice-rank subsl2-1))) | |
(< (rotslice-rank subsl1-1) (rotslice-rank subsl2-1)) | |
(< (rotslice-rank subsl1-2) (rotslice-rank subsl2-2)))) | |
(define largest-rank | |
(fill-slice-ranks! (sort (vector->list new-substrs) cmp-sublen) cmp-sublen)) | |
(cons new-substrs | |
(cond [(= (+ 1 largest-rank) len) '()] | |
[(<= (* sublen/2 4) len) (loop (* sublen/2 2) new-substrs)] | |
[else '()])))))])) | |
(define max-sublen | |
(arithmetic-shift 1 (- (vector-length substrs-table) 1))) | |
(define (rots-cmp rot1 rot2) | |
(let cmploop ([pos 0] [i (sub1 (vector-length substrs-table))] [sublen max-sublen]) | |
(cond | |
[(= pos len) #f] | |
[(<= (+ pos sublen) len) | |
(define substrs (vector-ref substrs-table i)) | |
(define substr1 (vector-ref substrs (modulo (+ (rotslice-start rot1) pos) len))) | |
(define substr2 (vector-ref substrs (modulo (+ (rotslice-start rot2) pos) len))) | |
(if (not (= (rotslice-rank substr1) (rotslice-rank substr2))) | |
(< (rotslice-rank substr1) (rotslice-rank substr2)) | |
(cmploop (+ pos sublen) (sub1 i) (arithmetic-shift sublen -1)))] | |
[else | |
(cmploop pos (sub1 i) (arithmetic-shift sublen -1))]))) | |
(define sorted-rots | |
(sort (build-list len (λ (i) (rotslice i len #f))) | |
rots-cmp)) | |
;; Compute the last column | |
(define bs-last-col (make-bytes len 0)) | |
(for ([i (in-range len)] | |
[slice (in-list sorted-rots)]) | |
(bytes-set! bs-last-col i (bytes-ref bs (modulo (+ len -1 (rotslice-start slice)) len)))) | |
(bwencode (index-where sorted-rots (λ (slice) (zero? (rotslice-start slice)))) | |
(bytes->immutable-bytes bs-last-col))) | |
(define (fill-slice-ranks! slices slice<) | |
(set-rotslice-rank! (car slices) 0) | |
(for/last ([curr-slice (cdr slices)] | |
[prev-slice (in-list slices)]) | |
(set-rotslice-rank! curr-slice (if (slice< prev-slice curr-slice) | |
(+ 1 (rotslice-rank prev-slice)) | |
(rotslice-rank prev-slice))) | |
(rotslice-rank curr-slice))) | |
(define bwt-encode bwt-encode/substr-exp-sort) | |
(module+ test | |
(check-equal? (plain-bwt-decode (bwencode 3 #"nnbaaa")) #"banana") | |
(check-equal? (plain-bwt-decode (bwencode 6 #"BNN^AA|A")) #"^BANANA|") | |
(check-equal? (plain-bwt-decode (bwencode 0 #"R")) #"R") | |
(check-equal? (plain-bwt-decode (bwencode 0 #"UN")) #"NU") | |
(check-equal? (plain-bwt-decode (bwencode 1 #"UN")) #"UN") | |
(let ([b #"the quick brown fox jumps over the lazy dog"]) | |
(check-equal? (plain-bwt-decode (plain-bwt-encode b)) b) | |
(check-equal? (plain-bwt-decode (plain-bwt-encode b) #:builtin-sort? #t) b)) | |
(check-equal? (plain-bwt-decode (bwencode 172 concolic-rot)) concolic-abs) | |
(check-equal? (time (displayln '(plain-bwt-decode (bwencode 0 letters-sorted))) | |
(plain-bwt-decode (bwencode 0 letters-sorted))) letters-rep)) | |
(define (plain-bwt-decode a-bwt #:builtin-sort? [builtin-sort? #f]) | |
(if builtin-sort? | |
(plain-bwt-decode/builtin-sort a-bwt) | |
(plain-bwt-decode/mat-radix-sort a-bwt))) | |
(define (plain-bwt-decode/builtin-sort a-bwt) | |
(define len (bytes-length (bwencode-bytes a-bwt))) | |
(define bs-mat | |
(for/fold ([bs-mat (build-list len (λ (ix) (make-bytes len)))]) | |
([end-ix (in-range len 0 -1)]) | |
(for ([bs (in-list bs-mat)] | |
[b (in-bytes (bwencode-bytes a-bwt))]) | |
(bytes-set! bs (sub1 end-ix) b)) | |
(sort bs-mat bytes<?))) | |
(list-ref bs-mat (bwencode-sorted-index a-bwt))) | |
(define (plain-bwt-decode/mat-radix-sort a-bwt) | |
(define len (bytes-length (bwencode-bytes a-bwt))) | |
(define bytes-idx/immutable | |
(vector->immutable-vector | |
(sorted-bytes-start-index (bwencode-bytes a-bwt)))) | |
(define bs-mat | |
(for/fold ([bs-mat (build-vector len (λ (ix) (make-bytes len)))]) | |
([end-ix (in-range len 0 -1)]) | |
(for ([bs (in-vector bs-mat)] | |
[b (in-bytes (bwencode-bytes a-bwt))]) | |
(bytes-set! bs (sub1 end-ix) b)) | |
;; one iteration of radix sort | |
(define bytes-idx (vector-copy bytes-idx/immutable)) | |
(define new-bs-mat (make-vector len #f)) | |
(for ([bs (in-vector bs-mat)] | |
[b (in-bytes (bwencode-bytes a-bwt))]) | |
(vector-set! new-bs-mat (vector-ref bytes-idx b) bs) | |
(vector-set! bytes-idx b (add1 (vector-ref bytes-idx b)))) | |
new-bs-mat)) | |
(vector-ref bs-mat (bwencode-sorted-index a-bwt))) | |
(module+ test | |
(check-equal? (bwt-decode (bwencode 3 #"nnbaaa")) #"banana") | |
(check-equal? (bwt-decode (bwencode 6 #"BNN^AA|A")) #"^BANANA|") | |
(check-equal? (bwt-decode (bwencode 0 #"R")) #"R") | |
(check-equal? (bwt-decode (bwencode 0 #"UN")) #"NU") | |
(check-equal? (bwt-decode (bwencode 1 #"UN")) #"UN") | |
(let ([b #"the quick brown fox jumps over the lazy dog"]) | |
(check-equal? (bwt-decode (plain-bwt-encode b)) b) | |
(check-equal? (bwt-decode (bwt-encode b)) b)) | |
(check-equal? (bwt-decode (bwencode 172 concolic-rot)) concolic-abs) | |
(check-equal? (time (displayln '(bwt-decode (bwencode 0 letters-sorted))) | |
(bwt-decode (bwencode 0 letters-sorted))) letters-rep)) | |
(define (bwt-decode a-bwt) | |
(match-define (struct* bwencode ([bytes input-bytes] | |
[sorted-index sorted-ix])) | |
a-bwt) | |
(define bytes-idx (sorted-bytes-start-index input-bytes)) | |
(define len (bytes-length input-bytes)) | |
;; `permute-index` stores how sorting some bytes `bs` permutates the bytes such that | |
;; (forall i. bs'[permute-index[i]] = bs[i]) <=> bs' = sort(bs), or equivalently | |
;; forall i. sort(bs)[i] = bs[permute-index^{-1}[i]] | |
(define permute-index | |
(for/vector #:length len ([b (in-bytes input-bytes)]) | |
(define ix (vector-ref bytes-idx b)) | |
(vector-set! bytes-idx b (add1 (vector-ref bytes-idx b))) | |
ix)) | |
(define output-bytes (make-bytes len)) | |
(for/fold ([sorted-ix sorted-ix]) | |
([i (in-range len)]) | |
(bytes-set! output-bytes (- len i 1) (bytes-ref input-bytes sorted-ix)) | |
(vector-ref permute-index sorted-ix)) | |
output-bytes) | |
(define (sorted-bytes-start-index bs | |
#:output-bytes-count! [bytes-count (make-vector 256 0)] | |
#:output-bytes-index! [bytes-idx (make-vector 256 0)]) | |
(for ([b (in-bytes bs)]) | |
(vector-set! bytes-count b (add1 (vector-ref bytes-count b)))) | |
(for/fold ([offset 0]) | |
([i (in-range 0 256)]) | |
(vector-set! bytes-idx i offset) | |
(+ offset (vector-ref bytes-count i))) | |
bytes-idx) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment