Skip to content

Instantly share code, notes, and snippets.

@J-Moravec
Created June 6, 2025 03:47
Show Gist options
  • Save J-Moravec/a49b02c3244f1277ad75497a2868e7d7 to your computer and use it in GitHub Desktop.
Save J-Moravec/a49b02c3244f1277ad75497a2868e7d7 to your computer and use it in GitHub Desktop.
tss
which_min = function(x){
i = which.min(x)
m = x[i]
c("min" = m, "index" = i)
}
closest_tss = function(x, tss){
# x needs to be a single row
if(nrow(x) != 1)
stop("\"x\" must have exactly single row")
# assume that tss was already split
tss = tss[[x[["chr"]]]]
# if NULL, return an empty data frame
if(is.null(tss))
return(new_closest_tss())
# consider only those with same sign
tss = tss[tss$strand == x[["strand"]],]
if(nrow(tss) == 0)
return(new_closest_tss())
# surely there is a better way to do this?
min = list(
"start" = which_min(abs(x[["start"]] - tss$start)),
"end" = which_min(abs(x[["end"]] - tss$start))
)
min = min[[which.min(c(min$start["min"], min$end["min"]))]]
tss = tss[min[["index"]],]
new_closest_tss(min[["min"]], tss$start, tss$ID)
}
find_closest_tss = function(x, tss){
x$start = as.integer(x$start)
x$end = as.integer(x$end)
tss$start = as.integer(tss$start)
# split TSS to make subsetting for chromosomes easier
tss = split(tss, tss$chr)
# Processing in chunks to make it a bit easier
y = split(x, x$chr)
r = vector("list", length(y))
names(r) = names(y)
for(i in seq_along(y)){
yy = split(y[[i]], 1:nrow(y[[i]])) # super inefficient but simple
rr = lapply(yy, closest_tss, tss)
rr = do.call(what = rbind, rr)
r[[i]] = rr
}
do.call(what = rbind, r)
}
# Finds the shortest of elements of x to elements of y.
# Both x and y must be sorted
sort_dist_c = inline::cfunction(
sig = signature("x" = "SEXP", "y" = "SEXP"),
language = "C",
convention = ".Call",
body = '
size_t lenx = XLENGTH(x);
size_t leny = XLENGTH(y);
if(!Rf_isInteger(x) || !Rf_isInteger(y)){
Rf_error("\\"x\\" and \\"y\\" must be of the type \\"integer\\"");
}
SEXP id = PROTECT(Rf_allocVector(INTSXP, lenx));
SEXP distance = PROTECT(Rf_allocVector(INTSXP, lenx));
size_t i_y = 0;
int d = 0;
int d_new = 0;
for(size_t i_x = 0; i_x < lenx; i_x++){
d = INTEGER(y)[i_y] - INTEGER(x)[i_x];
while(i_y < (leny - 1)){
d_new = INTEGER(y)[i_y + 1] - INTEGER(x)[i_x];
if(abs(d) < abs(d_new)){
// Current is smaller, all other will only be further away
break;
}
d = d_new;
i_y++;
}
INTEGER(id)[i_x] = i_y + 1; // turn to 1-index
INTEGER(distance)[i_x] = d;
}
SEXP res = PROTECT(Rf_allocVector(VECSXP, 2));
SET_VECTOR_ELT(res, 0, id);
SET_VECTOR_ELT(res, 1, distance);
// define names and assign them to res
SEXP names = PROTECT(Rf_allocVector(STRSXP, 2));
SET_STRING_ELT(names, 0, Rf_mkChar("index"));
SET_STRING_ELT(names, 1, Rf_mkChar("distance"));
Rf_setAttrib(res, R_NamesSymbol, names);
UNPROTECT(4);
return res;
'
)
new_closest_tss = function(distance, position, id){
data.frame(
"tss_distance" = distance,
"tss_position" = position,
"tss_id" = id
)
}
new_empty_tss = function(n){
data.frame(
"tss_distance" = rep(NA, n),
"tss_position" = NA,
"tss_id" = NA
)
}
# this is for a single chromosome
closest_tss2 = function(x, tss){
# for sort_dist_c both x and tss need sorted
# use order(order(x)) to restore previous order after sorting
# reorder tss according to start
tss$start = as.integer(tss$start)
tss = tss[order(tss$start),]
# sort start, calculate distace, and restore original order
x$start = as.integer(x$start)
o = order(x$start)
oo = order(o) # original order
start_dist = sort_dist_c(x$start[o], tss$start)
start_dist$index[oo]
start_dist$distance[oo]
# sort end, calculate distance, and restore original order
x$end = as.integer(x$end)
o = order(x$end)
end_dist = sort_dist_c(x$end[o], tss$start)
end_dist$index[oo]
end_dist$distance[oo]
# calculate which which of start and end dist is smaller
select = abs(start_dist$distance) < abs(end_dist$distance)
index = ifelse(select, start_dist$index, end_dist$index)
distance = ifelse(select, start_dist$distance, end_dist$distance)
new_closest_tss(
abs(distance),
tss$start[index],
tss$ID[index]
)
}
find_closest_tss2 = function(x, tss, stranded = TRUE){
x$start = as.integer(x$start)
x$end = as.integer(x$end)
tss$start = as.integer(tss$start)
# split tss and x by chromosome
tss = split(tss, tss$chr)
x = split(x, x$chr)
r = vector("list", length(x))
for(name in names(x)){
y = x[[name]]
t = tss[[name]]
# not all chromosomes/scaffolds have tss
if(is.null(t)){
r[[name]] = new_empty_tss(nrow(y))
next
}
if(stranded){
y_plus = y$strand == "+"
t_plus = t$strand == "+"
rr = new_empty_tss(nrow(y))
rr[y_plus,] = closest_tss2(y[y_plus,], t[t_plus,])
rr[!y_plus,] = closest_tss2(y[!y_plus,], t[!t_plus,])
r[[name]] = rr
} else {
r[[name]] = closest_tss2(y, t)
}
}
do.call(what = rbind, r)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment