Skip to content

Instantly share code, notes, and snippets.

@Christoph999
Created May 6, 2016 14:28
Show Gist options
  • Save Christoph999/d6436bc3dd40f8d9e169ade973353e3d to your computer and use it in GitHub Desktop.
Save Christoph999/d6436bc3dd40f8d9e169ade973353e3d to your computer and use it in GitHub Desktop.
library(data.table)
# Minimum and Maximum number of rows:
nmin <- 1e2
nmax <- 1e4 # Be careful with values >1e6. It takes several minutes then...
# Old search without data.table
find_routes <- function(routes, lat1, lng1, lat2, lng2, prec = 4L){
x1 <- lat1
y1 <- lng1
x2 <- lat2
y2 <- lng2
assertthat::assert_that(is.numeric(c(x1,y1,x2,y2)))
found_route <- routes[round(routes$lat1, prec) == round(x1, prec) &
round(routes$lng1, prec) == round(y1, prec) &
round(routes$lat2, prec) == round(x2, prec) &
round(routes$lng2, prec) == round(y2, prec), ]
return(found_route)
}
# Binary search using data.table
find_routes_env <- function(routes_env, x1, y1, x2, y2){
time <- get(paste(x1, y1, x2, y2, sep = '|'), envir = routes_env, inherits = F)
res <- data.table(lat1=x1, lng1=y1, lat2=x2, lng2=y2, time=time)
return(res)
}
# Generate testdata
generate_routes_dt <- function(nmax) {
message("generate_routes_dt...")
routes <- data.table::data.table(lat1 = numeric(nmax),
lng1 = numeric(nmax),
lat2 = numeric(nmax),
lng2 = numeric(nmax),
time = numeric(nmax))#, stringsAsFactors = F) not needed for data.table
set.seed(1)
tmp <- sample(seq(46, 49, length.out = nmax), nmax)
routes$lat1 <- tmp
set.seed(2)
tmp <- sample(seq(8, 10, length.out = nmax), nmax)
routes$lng1 <- tmp
set.seed(3)
tmp <- sample(seq(46, 49, length.out = nmax), nmax)
routes$lat2 <- tmp
set.seed(4)
tmp <- sample(seq(8, 10, length.out = nmax), nmax)
routes$lng2 <- tmp
set.seed(5)
tmp <- sample(seq(0, 1e7, length.out = nmax), nmax)
routes$time <- as.integer(tmp)
data.table::setkey(routes,NULL)
routes <- unique(routes)
routes <- routes[order(lat1, lng1, lat2, lng2, time)]
data.table::setattr(routes, "sorted", c("lat1", "lng1", "lat2", "lng2", "time"))
data.table::setkey(routes, lat1, lng1, lat2, lng2, time)
set.seed(6)
search_route <- routes[9, ]
message(paste0("search in generated data.table for ",
paste(search_route, collapse = ", ")))
return(list(routes = routes, search_route = search_route))
}
calculate_runtimes <- function(routes, n, search_route, type = "real") {
message(type)
dt_find <- numeric(length(n))
dt_bin <- numeric(length(n))
dt_env <- numeric(length(n))
for (i in 1:length(n)) {
# Search without data.table
myroutes <- routes[(1:n[i]), ]
dt_find[i] <- summary(microbenchmark::microbenchmark(
found_vec <- find_routes(myroutes,
search_route$lat1, search_route$lng1, search_route$lat2, search_route$lng2),
times = 50L, unit = "us"))$uq
print(paste0("find_routes needs: ", round(dt_find[i],2)," us to search in ", n[i], " rows."))
# Binary search
dt_bin[i] <- summary(microbenchmark::microbenchmark(
found_bin <- myroutes[J(search_route), nomatch = 0L],
times = 50L, unit = "us"))$uq
print(paste0("binary search needs: ", round(dt_bin[i],2)," us to search in ", n[i], " rows."))
if (all((found_bin != found_vec) | (found_bin != search_route))) {
message(found_bin)
message(found_vec)
}
# Create hash table from data.table
times_list <- as.list(myroutes$time)
names(times_list) <- paste(myroutes$lat1, myroutes$lng1,
myroutes$lat2, myroutes$lng2, sep='|')
routes_env <- list2env(times_list)
# Hash search in environment.
found_env <- find_routes_env(routes_env, search_route$lat1, search_route$lng1,
search_route$lat2, search_route$lng2)
dt_env[i] <- summary(microbenchmark::microbenchmark(
found_env <- find_routes_env(routes_env, search_route$lat1, search_route$lng1,
search_route$lat2, search_route$lng2),
times = 50L, unit = "us"))$uq
print(paste0("search in environment needs: ", round(dt_env[i],2)," us to search in ", n[i], " rows."))
if (all((found_env != found_vec) | (found_env != search_route))) {
message(found_env)
message(found_vec)
}
}
return(list(dt_find = dt_find, dt_bin = dt_bin, dt_env = dt_env))
}
# ==============================================================================
# Create search task generated data
n_test <- 10^c(log10(nmin):log10(nmax))
# Create search task with generated data.table
DT <- generate_routes_dt(nmax)
generated_routes_dt <- DT$routes
search_route_dt <- DT$search_route
dt_test <- calculate_runtimes(generated_routes_dt, n_test, search_route_dt, type = "testdata")
# ==============================================================================
# Plot Real data with fit
ymin <- nmin
ymax <- nmax
# plot test data with find routine
plot(n_test, dt_test$dt_find, col = "black", xlim = c(nmin/2,nmax), ylim = c(ymin, ymax),
xlab = "Number of rows in routes",
ylab = expression(paste("Time [", mu, "s]")),
log = "xy", yaxt = "n",
xaxt = "n", main = "runtime via microbenchmark: find_routes vs binary search")
fit_test_find <- lm(dt_test$dt_find ~ n_test^2)
abline(fit_test_find, col = "green", lty = 2, untf = T)
# plot test data with binary search routine
points(n_test, dt_test$dt_bin, col = "blue")
fit_test_bin <- lm(dt_test$dt_bin ~ n_test)
abline(fit_test_bin, col = "blue", lty = 2, untf = T)
# plot test data with search in environment
points(n_test, dt_test$dt_env, col = "red")
fit_test_env <- lm(dt_test$dt_env ~ n_test)
abline(fit_test_env, col = "red", lty = 2, untf = T)
# Labels and legend...
at.y <- outer(1:9, 10^(log10(ymin):log10(ymax)))
lab.y <- ifelse(log10(at.y) %% 1 == 0,
sapply(at.y, function(i)
as.expression(bquote(10^.(log10(i))))
), NA)
axis(2, at = at.y, labels = lab.y, las = 1)
at.x <- outer(1:9, 10^(0:log10(nmax)))
lab.x <- ifelse(log10(at.x) %% 1 == 0,
sapply(at.x, function(i)
as.expression(bquote(10^.(log10(i))))
), NA)
axis(1, at = at.x, labels = lab.x, las = 1)
grid (NULL,NULL, lty = 6, col = "cornsilk2")
legend("topleft", legend = c("Test data DT + find",
"Test data DT + binary search", "Test data in env + hash search"),
pch = c(1, 1, 1), lty = c(1, 1,1), col = c("green", "blue", "red"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment