J-Moravec · June 6, 2025 03:47
diff --git a/tss.r b/tss.r

 which_min = function(x){
    i = which.min(x)
    m = x[i]
    c("min" = m, "index" = i)
    }


 closest_tss = function(x, tss){
    # x needs to be a single row
    if(nrow(x) != 1)
        stop("\"x\" must have exactly single row")

    # assume that tss was already split
    tss = tss[[x[["chr"]]]]
    # if NULL, return an empty data frame
    if(is.null(tss))
        return(new_closest_tss())

    # consider only those with same sign
    tss = tss[tss$strand == x[["strand"]],]
    if(nrow(tss) == 0)
        return(new_closest_tss())

    # surely there is a better way to do this? 
    min = list(
        "start" = which_min(abs(x[["start"]] - tss$start)),
        "end" = which_min(abs(x[["end"]] - tss$start))
        )
    min = min[[which.min(c(min$start["min"], min$end["min"]))]]
    tss = tss[min[["index"]],]

    new_closest_tss(min[["min"]], tss$start, tss$ID)
    }


 find_closest_tss = function(x, tss){
    x$start = as.integer(x$start)
    x$end = as.integer(x$end)
    tss$start = as.integer(tss$start)

    # split TSS to make subsetting for chromosomes easier
    tss = split(tss, tss$chr)

    # Processing in chunks to make it a bit easier
    y = split(x, x$chr)
    r = vector("list", length(y))
    names(r) = names(y)
    for(i in seq_along(y)){
        yy = split(y[[i]], 1:nrow(y[[i]])) # super inefficient but simple
        rr = lapply(yy, closest_tss, tss)
        rr = do.call(what = rbind, rr)
        r[[i]] = rr
        }
    do.call(what = rbind, r)
    }


 # Finds the shortest of elements of x to elements of y.
 # Both x and y must be sorted
 sort_dist_c = inline::cfunction(
    sig = signature("x" = "SEXP", "y" = "SEXP"),
    language = "C",
    convention = ".Call",
    body = '
    size_t lenx = XLENGTH(x);
    size_t leny = XLENGTH(y);

    if(!Rf_isInteger(x) || !Rf_isInteger(y)){
        Rf_error("\\"x\\" and \\"y\\" must be of the type \\"integer\\"");
        }


    SEXP id = PROTECT(Rf_allocVector(INTSXP, lenx));
    SEXP distance = PROTECT(Rf_allocVector(INTSXP, lenx));

    size_t i_y = 0;
    int d = 0;
    int d_new = 0;
    for(size_t i_x = 0; i_x < lenx; i_x++){
        d = INTEGER(y)[i_y] - INTEGER(x)[i_x];
        while(i_y < (leny - 1)){
            d_new = INTEGER(y)[i_y + 1] - INTEGER(x)[i_x];
            if(abs(d) < abs(d_new)){
                // Current is smaller, all other will only be further away
                break;
                }
            d = d_new;
            i_y++;
            }
        INTEGER(id)[i_x] = i_y + 1; // turn to 1-index
        INTEGER(distance)[i_x] = d;
        }

    SEXP res = PROTECT(Rf_allocVector(VECSXP, 2));
    SET_VECTOR_ELT(res, 0, id);
    SET_VECTOR_ELT(res, 1, distance);

    // define names and assign them to res
    SEXP names = PROTECT(Rf_allocVector(STRSXP, 2));
    SET_STRING_ELT(names, 0, Rf_mkChar("index"));
    SET_STRING_ELT(names, 1, Rf_mkChar("distance"));
    Rf_setAttrib(res, R_NamesSymbol, names);
    UNPROTECT(4);
    return res;
    '
    )



 new_closest_tss = function(distance, position, id){
    data.frame(
        "tss_distance" = distance,
        "tss_position" = position,
        "tss_id" = id
        )
    }


 new_empty_tss = function(n){
    data.frame(
        "tss_distance" = rep(NA, n),
        "tss_position" = NA,
        "tss_id" = NA
        )
    }



 # this is for a single chromosome
 closest_tss2 = function(x, tss){
    # for sort_dist_c both x and tss need sorted
    # use order(order(x)) to restore previous order after sorting

    # reorder tss according to start
    tss$start = as.integer(tss$start)
    tss = tss[order(tss$start),]

    # sort start, calculate distace, and restore original order
    x$start = as.integer(x$start)
    o = order(x$start)
    oo = order(o) # original order
    start_dist = sort_dist_c(x$start[o], tss$start)
    start_dist$index[oo]
    start_dist$distance[oo]

    # sort end, calculate distance, and restore original order
    x$end = as.integer(x$end)
    o = order(x$end)
    end_dist = sort_dist_c(x$end[o], tss$start)
    end_dist$index[oo]
    end_dist$distance[oo]

    # calculate which which of start and end dist is smaller
    select = abs(start_dist$distance) < abs(end_dist$distance)

    index = ifelse(select, start_dist$index, end_dist$index)
    distance = ifelse(select, start_dist$distance, end_dist$distance)

    new_closest_tss(
        abs(distance),
        tss$start[index],
        tss$ID[index]
        )
    }

 find_closest_tss2 = function(x, tss, stranded = TRUE){
    x$start = as.integer(x$start)
    x$end = as.integer(x$end)
    tss$start = as.integer(tss$start)

    # split tss and x by chromosome
    tss = split(tss, tss$chr)
    x = split(x, x$chr)

    r = vector("list", length(x))
    for(name in names(x)){
        y = x[[name]]
        t = tss[[name]]

        # not all chromosomes/scaffolds have tss
        if(is.null(t)){
            r[[name]] = new_empty_tss(nrow(y))
            next
            }

        if(stranded){
            y_plus = y$strand == "+"
            t_plus = t$strand == "+"

            rr = new_empty_tss(nrow(y))
            rr[y_plus,] = closest_tss2(y[y_plus,], t[t_plus,])
            rr[!y_plus,] = closest_tss2(y[!y_plus,], t[!t_plus,])
            r[[name]] = rr
            } else {
            r[[name]] = closest_tss2(y, t)
            }
        }
    do.call(what = rbind, r)
    }

	which_min = function(x){
	i = which.min(x)
	m = x[i]
	c("min" = m, "index" = i)
	}


	closest_tss = function(x, tss){
	# x needs to be a single row
	if(nrow(x) != 1)
	stop("\"x\" must have exactly single row")

	# assume that tss was already split
	tss = tss[[x[["chr"]]]]
	# if NULL, return an empty data frame
	if(is.null(tss))
	return(new_closest_tss())

	# consider only those with same sign
	tss = tss[tss$strand == x[["strand"]],]
	if(nrow(tss) == 0)
	return(new_closest_tss())

	# surely there is a better way to do this?
	min = list(
	"start" = which_min(abs(x[["start"]] - tss$start)),
	"end" = which_min(abs(x[["end"]] - tss$start))
	)
	min = min[[which.min(c(min$start["min"], min$end["min"]))]]
	tss = tss[min[["index"]],]

	new_closest_tss(min[["min"]], tss$start, tss$ID)
	}


	find_closest_tss = function(x, tss){
	x$start = as.integer(x$start)
	x$end = as.integer(x$end)
	tss$start = as.integer(tss$start)

	# split TSS to make subsetting for chromosomes easier
	tss = split(tss, tss$chr)

	# Processing in chunks to make it a bit easier
	y = split(x, x$chr)
	r = vector("list", length(y))
	names(r) = names(y)
	for(i in seq_along(y)){
	yy = split(y[[i]], 1:nrow(y[[i]])) # super inefficient but simple
	rr = lapply(yy, closest_tss, tss)
	rr = do.call(what = rbind, rr)
	r[[i]] = rr
	}
	do.call(what = rbind, r)
	}


	# Finds the shortest of elements of x to elements of y.
	# Both x and y must be sorted
	sort_dist_c = inline::cfunction(
	sig = signature("x" = "SEXP", "y" = "SEXP"),
	language = "C",
	convention = ".Call",
	body = '
	size_t lenx = XLENGTH(x);
	size_t leny = XLENGTH(y);

	if(!Rf_isInteger(x) \|\| !Rf_isInteger(y)){
	Rf_error("\\"x\\" and \\"y\\" must be of the type \\"integer\\"");
	}


	SEXP id = PROTECT(Rf_allocVector(INTSXP, lenx));
	SEXP distance = PROTECT(Rf_allocVector(INTSXP, lenx));

	size_t i_y = 0;
	int d = 0;
	int d_new = 0;
	for(size_t i_x = 0; i_x < lenx; i_x++){
	d = INTEGER(y)[i_y] - INTEGER(x)[i_x];
	while(i_y < (leny - 1)){
	d_new = INTEGER(y)[i_y + 1] - INTEGER(x)[i_x];
	if(abs(d) < abs(d_new)){
	// Current is smaller, all other will only be further away
	break;
	}
	d = d_new;
	i_y++;
	}
	INTEGER(id)[i_x] = i_y + 1; // turn to 1-index
	INTEGER(distance)[i_x] = d;
	}

	SEXP res = PROTECT(Rf_allocVector(VECSXP, 2));
	SET_VECTOR_ELT(res, 0, id);
	SET_VECTOR_ELT(res, 1, distance);

	// define names and assign them to res
	SEXP names = PROTECT(Rf_allocVector(STRSXP, 2));
	SET_STRING_ELT(names, 0, Rf_mkChar("index"));
	SET_STRING_ELT(names, 1, Rf_mkChar("distance"));
	Rf_setAttrib(res, R_NamesSymbol, names);
	UNPROTECT(4);
	return res;
	'
	)



	new_closest_tss = function(distance, position, id){
	data.frame(
	"tss_distance" = distance,
	"tss_position" = position,
	"tss_id" = id
	)
	}


	new_empty_tss = function(n){
	data.frame(
	"tss_distance" = rep(NA, n),
	"tss_position" = NA,
	"tss_id" = NA
	)
	}



	# this is for a single chromosome
	closest_tss2 = function(x, tss){
	# for sort_dist_c both x and tss need sorted
	# use order(order(x)) to restore previous order after sorting

	# reorder tss according to start
	tss$start = as.integer(tss$start)
	tss = tss[order(tss$start),]

	# sort start, calculate distace, and restore original order
	x$start = as.integer(x$start)
	o = order(x$start)
	oo = order(o) # original order
	start_dist = sort_dist_c(x$start[o], tss$start)
	start_dist$index[oo]
	start_dist$distance[oo]

	# sort end, calculate distance, and restore original order
	x$end = as.integer(x$end)
	o = order(x$end)
	end_dist = sort_dist_c(x$end[o], tss$start)
	end_dist$index[oo]
	end_dist$distance[oo]

	# calculate which which of start and end dist is smaller
	select = abs(start_dist$distance) < abs(end_dist$distance)

	index = ifelse(select, start_dist$index, end_dist$index)
	distance = ifelse(select, start_dist$distance, end_dist$distance)

	new_closest_tss(
	abs(distance),
	tss$start[index],
	tss$ID[index]
	)
	}

	find_closest_tss2 = function(x, tss, stranded = TRUE){
	x$start = as.integer(x$start)
	x$end = as.integer(x$end)
	tss$start = as.integer(tss$start)

	# split tss and x by chromosome
	tss = split(tss, tss$chr)
	x = split(x, x$chr)

	r = vector("list", length(x))
	for(name in names(x)){
	y = x[[name]]
	t = tss[[name]]

	# not all chromosomes/scaffolds have tss
	if(is.null(t)){
	r[[name]] = new_empty_tss(nrow(y))
	next
	}

	if(stranded){
	y_plus = y$strand == "+"
	t_plus = t$strand == "+"

	rr = new_empty_tss(nrow(y))
	rr[y_plus,] = closest_tss2(y[y_plus,], t[t_plus,])
	rr[!y_plus,] = closest_tss2(y[!y_plus,], t[!t_plus,])
	r[[name]] = rr
	} else {
	r[[name]] = closest_tss2(y, t)
	}
	}
	do.call(what = rbind, r)
	}