Created
June 6, 2025 03:47
-
-
Save J-Moravec/a49b02c3244f1277ad75497a2868e7d7 to your computer and use it in GitHub Desktop.
tss
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
which_min = function(x){ | |
i = which.min(x) | |
m = x[i] | |
c("min" = m, "index" = i) | |
} | |
closest_tss = function(x, tss){ | |
# x needs to be a single row | |
if(nrow(x) != 1) | |
stop("\"x\" must have exactly single row") | |
# assume that tss was already split | |
tss = tss[[x[["chr"]]]] | |
# if NULL, return an empty data frame | |
if(is.null(tss)) | |
return(new_closest_tss()) | |
# consider only those with same sign | |
tss = tss[tss$strand == x[["strand"]],] | |
if(nrow(tss) == 0) | |
return(new_closest_tss()) | |
# surely there is a better way to do this? | |
min = list( | |
"start" = which_min(abs(x[["start"]] - tss$start)), | |
"end" = which_min(abs(x[["end"]] - tss$start)) | |
) | |
min = min[[which.min(c(min$start["min"], min$end["min"]))]] | |
tss = tss[min[["index"]],] | |
new_closest_tss(min[["min"]], tss$start, tss$ID) | |
} | |
find_closest_tss = function(x, tss){ | |
x$start = as.integer(x$start) | |
x$end = as.integer(x$end) | |
tss$start = as.integer(tss$start) | |
# split TSS to make subsetting for chromosomes easier | |
tss = split(tss, tss$chr) | |
# Processing in chunks to make it a bit easier | |
y = split(x, x$chr) | |
r = vector("list", length(y)) | |
names(r) = names(y) | |
for(i in seq_along(y)){ | |
yy = split(y[[i]], 1:nrow(y[[i]])) # super inefficient but simple | |
rr = lapply(yy, closest_tss, tss) | |
rr = do.call(what = rbind, rr) | |
r[[i]] = rr | |
} | |
do.call(what = rbind, r) | |
} | |
# Finds the shortest of elements of x to elements of y. | |
# Both x and y must be sorted | |
sort_dist_c = inline::cfunction( | |
sig = signature("x" = "SEXP", "y" = "SEXP"), | |
language = "C", | |
convention = ".Call", | |
body = ' | |
size_t lenx = XLENGTH(x); | |
size_t leny = XLENGTH(y); | |
if(!Rf_isInteger(x) || !Rf_isInteger(y)){ | |
Rf_error("\\"x\\" and \\"y\\" must be of the type \\"integer\\""); | |
} | |
SEXP id = PROTECT(Rf_allocVector(INTSXP, lenx)); | |
SEXP distance = PROTECT(Rf_allocVector(INTSXP, lenx)); | |
size_t i_y = 0; | |
int d = 0; | |
int d_new = 0; | |
for(size_t i_x = 0; i_x < lenx; i_x++){ | |
d = INTEGER(y)[i_y] - INTEGER(x)[i_x]; | |
while(i_y < (leny - 1)){ | |
d_new = INTEGER(y)[i_y + 1] - INTEGER(x)[i_x]; | |
if(abs(d) < abs(d_new)){ | |
// Current is smaller, all other will only be further away | |
break; | |
} | |
d = d_new; | |
i_y++; | |
} | |
INTEGER(id)[i_x] = i_y + 1; // turn to 1-index | |
INTEGER(distance)[i_x] = d; | |
} | |
SEXP res = PROTECT(Rf_allocVector(VECSXP, 2)); | |
SET_VECTOR_ELT(res, 0, id); | |
SET_VECTOR_ELT(res, 1, distance); | |
// define names and assign them to res | |
SEXP names = PROTECT(Rf_allocVector(STRSXP, 2)); | |
SET_STRING_ELT(names, 0, Rf_mkChar("index")); | |
SET_STRING_ELT(names, 1, Rf_mkChar("distance")); | |
Rf_setAttrib(res, R_NamesSymbol, names); | |
UNPROTECT(4); | |
return res; | |
' | |
) | |
new_closest_tss = function(distance, position, id){ | |
data.frame( | |
"tss_distance" = distance, | |
"tss_position" = position, | |
"tss_id" = id | |
) | |
} | |
new_empty_tss = function(n){ | |
data.frame( | |
"tss_distance" = rep(NA, n), | |
"tss_position" = NA, | |
"tss_id" = NA | |
) | |
} | |
# this is for a single chromosome | |
closest_tss2 = function(x, tss){ | |
# for sort_dist_c both x and tss need sorted | |
# use order(order(x)) to restore previous order after sorting | |
# reorder tss according to start | |
tss$start = as.integer(tss$start) | |
tss = tss[order(tss$start),] | |
# sort start, calculate distace, and restore original order | |
x$start = as.integer(x$start) | |
o = order(x$start) | |
oo = order(o) # original order | |
start_dist = sort_dist_c(x$start[o], tss$start) | |
start_dist$index[oo] | |
start_dist$distance[oo] | |
# sort end, calculate distance, and restore original order | |
x$end = as.integer(x$end) | |
o = order(x$end) | |
end_dist = sort_dist_c(x$end[o], tss$start) | |
end_dist$index[oo] | |
end_dist$distance[oo] | |
# calculate which which of start and end dist is smaller | |
select = abs(start_dist$distance) < abs(end_dist$distance) | |
index = ifelse(select, start_dist$index, end_dist$index) | |
distance = ifelse(select, start_dist$distance, end_dist$distance) | |
new_closest_tss( | |
abs(distance), | |
tss$start[index], | |
tss$ID[index] | |
) | |
} | |
find_closest_tss2 = function(x, tss, stranded = TRUE){ | |
x$start = as.integer(x$start) | |
x$end = as.integer(x$end) | |
tss$start = as.integer(tss$start) | |
# split tss and x by chromosome | |
tss = split(tss, tss$chr) | |
x = split(x, x$chr) | |
r = vector("list", length(x)) | |
for(name in names(x)){ | |
y = x[[name]] | |
t = tss[[name]] | |
# not all chromosomes/scaffolds have tss | |
if(is.null(t)){ | |
r[[name]] = new_empty_tss(nrow(y)) | |
next | |
} | |
if(stranded){ | |
y_plus = y$strand == "+" | |
t_plus = t$strand == "+" | |
rr = new_empty_tss(nrow(y)) | |
rr[y_plus,] = closest_tss2(y[y_plus,], t[t_plus,]) | |
rr[!y_plus,] = closest_tss2(y[!y_plus,], t[!t_plus,]) | |
r[[name]] = rr | |
} else { | |
r[[name]] = closest_tss2(y, t) | |
} | |
} | |
do.call(what = rbind, r) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment