Christoph999 · May 6, 2016 14:28
diff --git a/DT vs hashed env b/DT vs hashed env
 library(data.table)
 # Minimum and Maximum number of rows:
 nmin <- 1e2
 nmax <- 1e4 # Be careful with values >1e6. It takes several minutes then...

 # Old search without data.table
 find_routes <- function(routes, lat1, lng1, lat2, lng2, prec = 4L){
  x1 <- lat1
  y1 <- lng1
  x2 <- lat2
  y2 <- lng2
  assertthat::assert_that(is.numeric(c(x1,y1,x2,y2)))
  found_route <- routes[round(routes$lat1, prec) == round(x1, prec) &
      round(routes$lng1, prec) == round(y1, prec) &
      round(routes$lat2, prec) == round(x2, prec) &
      round(routes$lng2, prec) == round(y2, prec), ]
  return(found_route)
 }

 # Binary search using data.table
 find_routes_env <- function(routes_env, x1, y1, x2, y2){
  time <- get(paste(x1, y1, x2, y2, sep = '|'), envir = routes_env, inherits = F)
  res <- data.table(lat1=x1, lng1=y1, lat2=x2, lng2=y2, time=time)
  return(res)
 }

 # Generate testdata
 generate_routes_dt <- function(nmax) {
  message("generate_routes_dt...")
  routes <- data.table::data.table(lat1 = numeric(nmax),
    lng1 = numeric(nmax),
    lat2 = numeric(nmax),
    lng2 = numeric(nmax),
    time = numeric(nmax))#, stringsAsFactors = F) not needed for data.table
  set.seed(1)
  tmp <- sample(seq(46, 49, length.out = nmax), nmax)
  routes$lat1 <- tmp
  set.seed(2)
  tmp <- sample(seq(8, 10, length.out = nmax), nmax)
  routes$lng1 <- tmp
  set.seed(3)
  tmp <- sample(seq(46, 49, length.out = nmax), nmax)
  routes$lat2 <- tmp
  set.seed(4)
  tmp <- sample(seq(8, 10, length.out = nmax), nmax)
  routes$lng2 <- tmp
  set.seed(5)
  tmp <- sample(seq(0, 1e7, length.out = nmax), nmax)
  routes$time <- as.integer(tmp)
  data.table::setkey(routes,NULL)
  routes <- unique(routes)
  routes <- routes[order(lat1, lng1, lat2, lng2, time)]
  data.table::setattr(routes, "sorted", c("lat1", "lng1", "lat2", "lng2", "time"))
  data.table::setkey(routes, lat1, lng1, lat2, lng2, time)
  set.seed(6)
  search_route <- routes[9, ]
  message(paste0("search in generated data.table for ",
    paste(search_route, collapse = ", ")))
  return(list(routes = routes, search_route = search_route))
 }

 calculate_runtimes <- function(routes, n, search_route, type = "real") {
  message(type)
  dt_find <- numeric(length(n))
  dt_bin <- numeric(length(n))
  dt_env <- numeric(length(n))
  for (i in 1:length(n)) {
    # Search without data.table
    myroutes <- routes[(1:n[i]), ]
    dt_find[i] <- summary(microbenchmark::microbenchmark(
      found_vec <- find_routes(myroutes,
        search_route$lat1, search_route$lng1, search_route$lat2, search_route$lng2),
      times = 50L, unit = "us"))$uq
    print(paste0("find_routes needs: ", round(dt_find[i],2)," us to search in ", n[i], " rows."))

    # Binary search
    dt_bin[i] <- summary(microbenchmark::microbenchmark(
      found_bin <- myroutes[J(search_route), nomatch = 0L],
      times = 50L, unit = "us"))$uq
    print(paste0("binary search needs: ", round(dt_bin[i],2)," us to search in ", n[i], " rows."))
    if (all((found_bin != found_vec) | (found_bin != search_route))) {
      message(found_bin)
      message(found_vec)
    }

    # Create hash table from data.table
    times_list <- as.list(myroutes$time)
    names(times_list) <- paste(myroutes$lat1, myroutes$lng1,
      myroutes$lat2, myroutes$lng2, sep='|')
    routes_env <- list2env(times_list)
    # Hash search in environment.
    found_env <- find_routes_env(routes_env, search_route$lat1, search_route$lng1,
      search_route$lat2, search_route$lng2)
    dt_env[i] <- summary(microbenchmark::microbenchmark(
      found_env <- find_routes_env(routes_env, search_route$lat1, search_route$lng1,
        search_route$lat2, search_route$lng2),
      times = 50L, unit = "us"))$uq
    print(paste0("search in environment needs: ", round(dt_env[i],2)," us to search in ", n[i], " rows."))
    if (all((found_env != found_vec) | (found_env != search_route))) {
      message(found_env)
      message(found_vec)
    }
  }
  return(list(dt_find = dt_find, dt_bin = dt_bin, dt_env = dt_env))
 }


 # ==============================================================================
 # Create search task generated data
 n_test <- 10^c(log10(nmin):log10(nmax))
 # Create search task with generated data.table
 DT <- generate_routes_dt(nmax)
 generated_routes_dt <- DT$routes
 search_route_dt <- DT$search_route
 dt_test <- calculate_runtimes(generated_routes_dt, n_test, search_route_dt, type = "testdata")


 # ==============================================================================
 # Plot Real data with fit
 ymin <- nmin
 ymax <- nmax
 # plot test data with find routine
 plot(n_test, dt_test$dt_find, col = "black", xlim = c(nmin/2,nmax), ylim = c(ymin, ymax),
  xlab = "Number of rows in routes",
  ylab = expression(paste("Time [", mu, "s]")),
  log = "xy", yaxt = "n",
  xaxt = "n", main = "runtime via microbenchmark: find_routes vs binary search")
 fit_test_find <- lm(dt_test$dt_find ~ n_test^2)
 abline(fit_test_find, col = "green", lty = 2, untf = T)

 # plot test data with binary search routine
 points(n_test, dt_test$dt_bin, col = "blue")
 fit_test_bin <- lm(dt_test$dt_bin ~ n_test)
 abline(fit_test_bin, col = "blue", lty = 2, untf = T)

 # plot test data with search in environment
 points(n_test, dt_test$dt_env, col = "red")
 fit_test_env <- lm(dt_test$dt_env ~ n_test)
 abline(fit_test_env, col = "red", lty = 2, untf = T)

 # Labels and legend...
 at.y <- outer(1:9, 10^(log10(ymin):log10(ymax)))
 lab.y <- ifelse(log10(at.y) %% 1 == 0,
  sapply(at.y, function(i)
    as.expression(bquote(10^.(log10(i))))
  ), NA)
 axis(2, at = at.y, labels = lab.y, las = 1)

 at.x <- outer(1:9, 10^(0:log10(nmax)))
 lab.x <- ifelse(log10(at.x) %% 1 == 0,
  sapply(at.x, function(i)
    as.expression(bquote(10^.(log10(i))))
  ), NA)
 axis(1, at = at.x, labels = lab.x, las = 1)
 grid (NULL,NULL, lty = 6, col = "cornsilk2")
 legend("topleft", legend = c("Test data DT + find",
  "Test data DT + binary search", "Test data in env + hash search"),
  pch = c(1, 1, 1), lty = c(1, 1,1), col = c("green", "blue", "red"))
	library(data.table)
	# Minimum and Maximum number of rows:
	nmin <- 1e2
	nmax <- 1e4 # Be careful with values >1e6. It takes several minutes then...

	# Old search without data.table
	find_routes <- function(routes, lat1, lng1, lat2, lng2, prec = 4L){
	x1 <- lat1
	y1 <- lng1
	x2 <- lat2
	y2 <- lng2
	assertthat::assert_that(is.numeric(c(x1,y1,x2,y2)))
	found_route <- routes[round(routes$lat1, prec) == round(x1, prec) &
	round(routes$lng1, prec) == round(y1, prec) &
	round(routes$lat2, prec) == round(x2, prec) &
	round(routes$lng2, prec) == round(y2, prec), ]
	return(found_route)
	}

	# Binary search using data.table
	find_routes_env <- function(routes_env, x1, y1, x2, y2){
	time <- get(paste(x1, y1, x2, y2, sep = '\|'), envir = routes_env, inherits = F)
	res <- data.table(lat1=x1, lng1=y1, lat2=x2, lng2=y2, time=time)
	return(res)
	}

	# Generate testdata
	generate_routes_dt <- function(nmax) {
	message("generate_routes_dt...")
	routes <- data.table::data.table(lat1 = numeric(nmax),
	lng1 = numeric(nmax),
	lat2 = numeric(nmax),
	lng2 = numeric(nmax),
	time = numeric(nmax))#, stringsAsFactors = F) not needed for data.table
	set.seed(1)
	tmp <- sample(seq(46, 49, length.out = nmax), nmax)
	routes$lat1 <- tmp
	set.seed(2)
	tmp <- sample(seq(8, 10, length.out = nmax), nmax)
	routes$lng1 <- tmp
	set.seed(3)
	tmp <- sample(seq(46, 49, length.out = nmax), nmax)
	routes$lat2 <- tmp
	set.seed(4)
	tmp <- sample(seq(8, 10, length.out = nmax), nmax)
	routes$lng2 <- tmp
	set.seed(5)
	tmp <- sample(seq(0, 1e7, length.out = nmax), nmax)
	routes$time <- as.integer(tmp)
	data.table::setkey(routes,NULL)
	routes <- unique(routes)
	routes <- routes[order(lat1, lng1, lat2, lng2, time)]
	data.table::setattr(routes, "sorted", c("lat1", "lng1", "lat2", "lng2", "time"))
	data.table::setkey(routes, lat1, lng1, lat2, lng2, time)
	set.seed(6)
	search_route <- routes[9, ]
	message(paste0("search in generated data.table for ",
	paste(search_route, collapse = ", ")))
	return(list(routes = routes, search_route = search_route))
	}

	calculate_runtimes <- function(routes, n, search_route, type = "real") {
	message(type)
	dt_find <- numeric(length(n))
	dt_bin <- numeric(length(n))
	dt_env <- numeric(length(n))
	for (i in 1:length(n)) {
	# Search without data.table
	myroutes <- routes[(1:n[i]), ]
	dt_find[i] <- summary(microbenchmark::microbenchmark(
	found_vec <- find_routes(myroutes,
	search_route$lat1, search_route$lng1, search_route$lat2, search_route$lng2),
	times = 50L, unit = "us"))$uq
	print(paste0("find_routes needs: ", round(dt_find[i],2)," us to search in ", n[i], " rows."))

	# Binary search
	dt_bin[i] <- summary(microbenchmark::microbenchmark(
	found_bin <- myroutes[J(search_route), nomatch = 0L],
	times = 50L, unit = "us"))$uq
	print(paste0("binary search needs: ", round(dt_bin[i],2)," us to search in ", n[i], " rows."))
	if (all((found_bin != found_vec) \| (found_bin != search_route))) {
	message(found_bin)
	message(found_vec)
	}

	# Create hash table from data.table
	times_list <- as.list(myroutes$time)
	names(times_list) <- paste(myroutes$lat1, myroutes$lng1,
	myroutes$lat2, myroutes$lng2, sep='\|')
	routes_env <- list2env(times_list)
	# Hash search in environment.
	found_env <- find_routes_env(routes_env, search_route$lat1, search_route$lng1,
	search_route$lat2, search_route$lng2)
	dt_env[i] <- summary(microbenchmark::microbenchmark(
	found_env <- find_routes_env(routes_env, search_route$lat1, search_route$lng1,
	search_route$lat2, search_route$lng2),
	times = 50L, unit = "us"))$uq
	print(paste0("search in environment needs: ", round(dt_env[i],2)," us to search in ", n[i], " rows."))
	if (all((found_env != found_vec) \| (found_env != search_route))) {
	message(found_env)
	message(found_vec)
	}
	}
	return(list(dt_find = dt_find, dt_bin = dt_bin, dt_env = dt_env))
	}


	# ==============================================================================
	# Create search task generated data
	n_test <- 10^c(log10(nmin):log10(nmax))
	# Create search task with generated data.table
	DT <- generate_routes_dt(nmax)
	generated_routes_dt <- DT$routes
	search_route_dt <- DT$search_route
	dt_test <- calculate_runtimes(generated_routes_dt, n_test, search_route_dt, type = "testdata")


	# ==============================================================================
	# Plot Real data with fit
	ymin <- nmin
	ymax <- nmax
	# plot test data with find routine
	plot(n_test, dt_test$dt_find, col = "black", xlim = c(nmin/2,nmax), ylim = c(ymin, ymax),
	xlab = "Number of rows in routes",
	ylab = expression(paste("Time [", mu, "s]")),
	log = "xy", yaxt = "n",
	xaxt = "n", main = "runtime via microbenchmark: find_routes vs binary search")
	fit_test_find <- lm(dt_test$dt_find ~ n_test^2)
	abline(fit_test_find, col = "green", lty = 2, untf = T)

	# plot test data with binary search routine
	points(n_test, dt_test$dt_bin, col = "blue")
	fit_test_bin <- lm(dt_test$dt_bin ~ n_test)
	abline(fit_test_bin, col = "blue", lty = 2, untf = T)

	# plot test data with search in environment
	points(n_test, dt_test$dt_env, col = "red")
	fit_test_env <- lm(dt_test$dt_env ~ n_test)
	abline(fit_test_env, col = "red", lty = 2, untf = T)

	# Labels and legend...
	at.y <- outer(1:9, 10^(log10(ymin):log10(ymax)))
	lab.y <- ifelse(log10(at.y) %% 1 == 0,
	sapply(at.y, function(i)
	as.expression(bquote(10^.(log10(i))))
	), NA)
	axis(2, at = at.y, labels = lab.y, las = 1)

	at.x <- outer(1:9, 10^(0:log10(nmax)))
	lab.x <- ifelse(log10(at.x) %% 1 == 0,
	sapply(at.x, function(i)
	as.expression(bquote(10^.(log10(i))))
	), NA)
	axis(1, at = at.x, labels = lab.x, las = 1)
	grid (NULL,NULL, lty = 6, col = "cornsilk2")
	legend("topleft", legend = c("Test data DT + find",
	"Test data DT + binary search", "Test data in env + hash search"),
	pch = c(1, 1, 1), lty = c(1, 1,1), col = c("green", "blue", "red"))