Last active
August 27, 2021 15:59
-
-
Save nvictus/460a6014a075156713fea5e09bdf3498 to your computer and use it in GitHub Desktop.
coolR: a cooler reader for R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Notes | |
# ----- | |
# * Cooler's stored bin IDs are 0-based. However, for consistency with R, this API should take | |
# 1-based indexing as input for table row and matrix range queries. | |
# * See Ilya's implementation: https://github.com/dozmorovlab/HiCcompare/issues/9 | |
library(hdf5r) | |
library(dplyr) | |
library(tibble) | |
library(purrr) | |
library(data.table) | |
library(Matrix) | |
library(slam) | |
library(R6) | |
#' Return the search indexes of a cooler | |
#' | |
#' @param clr H5File or H5Group bearing a cooler data collection. | |
read_indexes <- function(clr) { | |
chrom_offset <- clr[['indexes/chrom_offset']][] + 1 | |
bin1_offset <- clr[['indexes/bin1_offset']][] + 1 | |
return (list("chrom_offset"=chrom_offset, | |
"bin1_offset"=bin1_offset)) | |
} | |
#' Query a columnar table stored as 1D arrays in an HDF5 group | |
#' | |
#' @param clr H5File or H5Group | |
#' @param path character. Path to the group containing the columns. | |
#' @param columns list. List of columns to read from. Default is all columns. | |
#' @param span pair. Range of rows of the table to read. Default is all rows. | |
read_table <- function(clr, path, columns=NULL, span=NULL) { | |
grp <- clr[[path]] | |
if (is.null(columns)) { | |
columns <- grp$names | |
} | |
if (is.null(span)) { | |
dset <- grp[[columns[1]]] | |
len <- dset$dims[1] | |
span <- c(1, len) | |
} | |
n_cols <- length(columns) | |
x <- vector(mode="list") | |
for (col in columns) { | |
vals <- grp[[col]][span[1]:span[2]] | |
if (is.factor_ext(vals)) { | |
vals <- as.character(vals) | |
} | |
x[[col]] <- vals | |
} | |
return (cbind.data.frame(x)) | |
} | |
#' Join bin information against a selection of pixels | |
#' | |
#' @param pixels data.frame. A table of pixel records containing bin1_id and | |
#' bin2_id columns. | |
#' @param bins data.frame. The entire bin table. | |
annotate <- function(pixels, bins) { | |
bins['idx'] <- as.numeric(rownames(bins)) | |
pixels <- inner_join(bins, pixels, by=c("idx" = "bin2_id")) | |
pixels <- pixels[,-which(names(pixels) == "idx")] | |
pixels <- inner_join(bins, pixels, by=c("idx" = "bin1_id"), suffix=c("1", "2")) | |
pixels <- pixels[,-which(names(pixels) == "idx")] | |
return (pixels) | |
} | |
.arg.prune.partition <- function(x, step) { | |
lo <- x[[1]] | |
hi <- x[[length((x))]] | |
num <- 2 + (hi - lo) / step | |
cuts <- seq(lo, hi, length.out=num) | |
return ( unique(findInterval(cuts, x)) ) | |
} | |
.comes.before <- function(a0, a1, b0, b1, strict=FALSE) { | |
if (a0 < b0) { | |
if (strict) { | |
return (a1 <= b0) | |
} else { | |
return (a1 <= b1) | |
} | |
} else { | |
return (FALSE) | |
} | |
} | |
.contains <- function(a0, a1, b0, b1, strict=FALSE) { | |
if (a0 > b0 || a1 < b1) { | |
return (FALSE) | |
} | |
if (strict && (a0 == b0 || a1 == b1)) { | |
return (FALSE) | |
} | |
return (a0 <= b0 && a1 >= b1) | |
} | |
#' 2D range query processor | |
#' | |
#' Designed to read the pixel data in manageable chunks. | |
#' | |
#' @section Public fields: | |
#' * `clr`: H5File or H5Group | |
#' * `i1`, `i2`: row range (1-based) | |
#' * `j1`, `j2`: column range (1-based) | |
#' * `field`: value column to read (default is "count") | |
#' * `chunksize`: number of pixel records per batch | |
RangeQuery <- R6Class("RangeQuery", | |
public = list( | |
clr=NULL, | |
i1=NULL, | |
i2=NULL, | |
j1=NULL, | |
j2=NULL, | |
field=NULL, | |
chunksize=NULL, | |
bin1_offsets=NULL, | |
loc_pruned_offsets=NULL, | |
n_chunks=NULL, | |
initialize = function(clr, i1, i2, j1, j2, field="count", chunksize=10000000) { | |
self$clr <- clr | |
self$i1 <- i1 | |
self$i2 <- i2 | |
self$j1 <- j1 | |
self$j2 <- j2 | |
self$field <- field | |
self$chunksize <- chunksize | |
indexes <- read_indexes(clr) | |
self$bin1_offsets <- indexes[["bin1_offset"]][self$i1:self$i2] | |
# subsample the offsets to produce batches of pixel records roughly | |
# `chunksize` in length corresponding to full matrix rows | |
self$loc_pruned_offsets <- .arg.prune.partition(self$bin1_offsets, chunksize) | |
self$n_chunks <- length(self$loc_pruned_offsets) - 1 | |
}, | |
read = function() { | |
out <- list() | |
for (i in seq_len(self$n_chunks)) { | |
out[[i]] <- self$read_chunk(i) | |
} | |
return (rbindlist(out)) | |
}, | |
read_chunk = function(chunk_id, mirror=FALSE) { | |
i1 <- self$i1 | |
i2 <- self$i2 | |
j1 <- self$j1 | |
j2 <- self$j2 | |
# extract data for a chunk of matrix rows | |
oi <- self$loc_pruned_offsets[chunk_id] | |
of <- self$loc_pruned_offsets[chunk_id + 1] | |
p1 <- self$bin1_offsets[oi] | |
p2 <- self$bin1_offsets[of] - 1 | |
bin2_extracted <- clr[['pixels/bin2_id']][p1:p2] | |
data_extracted <- clr[['pixels/count']][p1:p2] | |
# iterate over matrix rows and filter | |
dfs <- list() | |
for (i in seq(oi, of-1)) { | |
# correct the offsets | |
lo <- self$bin1_offsets[i] - p1 + 1 | |
hi <- self$bin1_offsets[i + 1] - p1 | |
if ((hi - lo) > 0) { | |
# this row | |
bin2 <- bin2_extracted[lo:hi] + 1 # change to 1-based | |
datx <- data_extracted[lo:hi] | |
if (any(is.na(bin2))) { | |
print(length(bin2_extracted)) | |
print(length(data_extracted)) | |
print(self$bin1_offsets[i] - p1) | |
print(self$bin1_offsets[i + 1] - p1) | |
print(bin2) | |
print(datx) | |
} | |
# filter for the range of j values we want | |
mask <- ((bin2 >= j1) & (bin2 <= j2)) | |
cols <- bin2[mask] | |
data <- datx[mask] | |
rows <- rep(i1 + i - 1, length(cols)) # 1-based | |
dfs[[i]] <- data.frame( | |
bin1_id=rows, | |
bin2_id=cols, | |
count=data) | |
} else { | |
dfs[[i]] <- data.frame(row.names=c("bin1_id", "bin2_id", "count")) | |
} | |
} | |
df <- rbindlist(dfs) | |
if (mirror) { | |
nodiag <- df[["bin1_id"]] != df[["bin2_id"]] | |
df <- rbind(df, | |
rename(df[nodiag,], | |
bin1_id=bin2_id, bin2_id=bin1_id)) | |
} | |
return (df) | |
} | |
) | |
) | |
#' A mirroring 2D range query processor for a symmetric matrix stored as upper | |
#' triangular data. | |
#' | |
#' Designed to read the pixel data in manageable chunks and fill in the lower | |
#' triangular data. Mirroring requires breaking up the query into one or more | |
#' unmirrored subqueries. | |
#' | |
#' @section Public fields: | |
#' * `clr`: H5File or H5Group | |
#' * `i1`, `i2`: row range (1-based) | |
#' * `j1`, `j2`: column range (1-based) | |
#' * `field`: value column to read (default is "count") | |
#' * `chunksize`: number of pixel records per batch | |
SymmTriuRangeQuery <- R6Class("SymmTriuRangeQuery", | |
public = list( | |
clr=NULL, | |
i1=NULL, | |
i2=NULL, | |
j1=NULL, | |
j2=NULL, | |
field=NULL, | |
chunksize=NULL, | |
transpose=NULL, | |
mode=NULL, | |
queries=NULL, | |
initialize = function(clr, i1, i2, j1, j2, | |
field="count", chunksize=10000000) { | |
self$clr <- clr | |
self$field <- field | |
self$chunksize <- chunksize | |
self$transpose <- FALSE | |
if (i1 == j1 && i2 == j2) { | |
self$queries <- list( | |
RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize) | |
) | |
self$mode = 1 | |
} else { | |
if (j1 < i1 || (i1 == j1 && i2 < j2)) { | |
t <- i1; i1 <- j1; j1 <- t | |
t <- i2; i2 <- j2; j2 <- t | |
self$transpose <- TRUE | |
} | |
if (.comes.before(i1, i2, j1, j2, strict=TRUE)) { | |
self$queries <- list( | |
RangeQuery$new(clr, i1, i2, j1, j2, field, chunksize) | |
) | |
self$mode = 2 | |
} else if (.comes.before(i1, i2, j1, j2)) { | |
self$queries <- list( | |
RangeQuery$new(clr, i1, j1, j1, i2, field, chunksize), | |
RangeQuery$new(clr, j1, i2, j1, i2, field, chunksize), | |
RangeQuery$new(clr, i1, i2, i2, j2, field, chunksize) | |
) | |
self$mode = 3 | |
} else if (.contains(i1, i2, j1, j2)) { | |
self$queries <- list( | |
RangeQuery$new(clr, i1, j1, j1, j2, field, chunksize), | |
RangeQuery$new(clr, j1, j2, j1, j2, field, chunksize), | |
RangeQuery$new(clr, j1, j2, j2, i2, field, chunksize) | |
) | |
self$mode = 4 | |
} else { | |
stop("this shouldn't happen") | |
} | |
} | |
}, | |
read_chunk = function(q, i) { | |
#' @param q int. Index of range subquery object to use. | |
#' @param i int. Index of chunk to extract from subquery. | |
#' @return data.frame. | |
rq <- self$queries[[q]] | |
mirror <- if (self$mode == 1 || q == 2) TRUE else FALSE | |
out <- rq$read_chunk(i, mirror=mirror) | |
if (self$mode == 4 && q == 3) { | |
if (!self$transpose) { | |
out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id") | |
} | |
} else if (self$transpose) { | |
out <- rename(out, bin1_id="bin2_id", bin2_id="bin1_id") | |
} | |
return (out) | |
}, | |
read = function() { | |
#' Process the query | |
#' | |
#' @return data.frame | |
out <- list() | |
k <- 1 | |
n_queries <- length(self$queries) | |
for (q in seq_len(n_queries)) { | |
n_chunks <- self$queries[[q]]$n_chunks | |
for (i in seq_len(n_chunks)) { | |
out[[k]] <- self$read_chunk(q, i) | |
k <- k + 1 | |
} | |
} | |
return (rbindlist(out, use.names=TRUE)) | |
} | |
) | |
) | |
# table queries | |
clr <- H5File$new('~/tmp/coolers/Rao2014-GM12878-MboI-allreps-filtered.10kb.cool', mode='r') | |
bins <- read_table(clr, 'bins') | |
sel <- read_table(clr, 'pixels', span=c(1, 10)) | |
sel <- annotate(sel, bins) | |
# matrix range query | |
# TODO: here, bin IDs are returned as 1-based, which is inconsistent with the table queries above. | |
i1 <- 3000 | |
i2 <- 4000 | |
j1 <- 3000 | |
j2 <- 4000 | |
rq <- SymmTriuRangeQuery$new(clr, i1, i2, j1, j2) | |
df <- rq$read() | |
m <- sparseMatrix(i=df[['bin1_id']] - i1 + 1, | |
j=df[['bin2_id']] - j1 + 1, | |
x=df[['count']] * (bins[df[['bin1_id']], 'weight']) * (bins[df[['bin2_id']], 'weight']), | |
dims=c(i2-i1+1, j2-j1+1)) | |
M <- as.matrix(m) | |
image( | |
seq(j1, j2), | |
seq(i1, i2), | |
log10(t(M)), | |
useRaster=TRUE, | |
asp=1, | |
ylim=c(i2, i1), | |
col=hcl.colors(256, "YlOrRd", rev = TRUE) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment