raymondben · August 29, 2015 14:07
diff --git a/eucalypt_height.R b/eucalypt_height.R
 ## see http://untan.gl/articles/2014/10/08_using-ala4r-mapping-eucalypt-height.html

 ## load the ALA4R package
 library(ALA4R)
 ## check that it's version 1.04 or later: the "guid" attribute on the sites_by_species data was added in that version
 if (as.numeric(packageDescription("ALA4R",fields="Version"))<1.04)
    stop("reinstall ALA4R to get v1.04 or later")

 ala_config(cache_directory=file.path("~","data","ala_cache"))

 ## some other packages that we'll use
 library(stringr)
 library(plyr)
 library(ggplot2)

 ## define the text searches (regular expressions) that we use to extract height information from a species profile
 htre=data.frame(re=c('(growing to|grows to|heights of up to)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m|metres)'),col_index=c(3),type=c("maximum"),stringsAsFactors=FALSE)
 htre=rbind(htre,list('(tree|shrub)[[:punct:][:space:]]+(up to|to)[[:space:]]*([\\.[:digit:]]+)[[:space:]]*(m|metres)',4,"maximum"))
 htre=rbind(htre,list('(tree|trees|shrub|shrubs|grows|grow|growing)[[:space:]]+to[[:space:]]+(roughly|about|around|over)[[:space:]]*([\\.[:digit:]]+)[[:space:]]*(m|metres)',4,"maximum")) ## text like "tree to about 15 metres, grows to about 10m"
 htre=rbind(htre,list('(tree|trees|shrub|shrubs)[[:space:]]+(about|around|roughly|over)[[:space:]]*([\\.[:digit:]]+)[[:space:]]*(m|metres)[[:space:]]+(high|tall|in height)',4,"maximum")) ## text like "tree about 4 m high"
 htre=rbind(htre,list('(to|may reach|can reach|up to|reaching|shrub|tree)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m|metres)[[:space:]]+(high|tall|in height)',3,"maximum"))
 htre=rbind(htre,list('(to|may reach|can reach|up to|reaching|shrub|tree)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m|metres)[[:space:]]+or more (high|tall|in height)',3,"maximum")) ## e.g. "reaching 80m or more in height"
 htre=rbind(htre,list('(to|may reach|can reach|up to|reaching|growing to|grows to)[[:space:]]+a[[:space:]]+height[[:space:]]+of[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m|metres)',3,"maximum"))
 htre=rbind(htre,list('(to|may reach|can reach|up to|reaching|growing to|grows to)[[:space:]]+a[[:space:]]+maximum[[:space:]]+height[[:space:]]+of[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m|metres)',3,"maximum"))
 htre=rbind(htre,list('(to|may reach|can reach|up to|reaching|growing to|shrub|shrubs|tree|trees|typically|generally|usually|commonly|about)[[:space:][:punct:]]+[\\.[:digit:]]+[[:space:]]*[–-][[:space:]]*([\\.[:digit:]]+)[[:space:]]*(m|metres)[[:space:]]+(high|tall|in height)',3,"maximum")) 
 htre=rbind(htre,list('(to|may reach|can reach|reaches|up to|reaching|growing to|shrub|shrubs|tree|trees)[[:space:][:punct:]]+(typically|generally|usually|commonly|about|of about)?[[:space:]]*[\\.[:digit:]]+[[:space:]]*[–-][[:space:]]*([\\.[:digit:]]+)[[:space:]]*(m|metres)[[:space:]]+(high|tall|in height)',4,"maximum")) ## e.g. "shrub, typically 2-3 metres high", "tree of about 6-9 metres in height", "reaches about 5-7 metres in height"
 htre=rbind(htre,list('(height|shrub|tree|shrubs|trees)[[:space:][:punct:]]+(can range|ranging|mostly seen)[[:space:]]+between[[:space:]]*[\\.[:digit:]]+[[:space:]]*(–|and|-)[[:space:]]*([\\.[:digit:]]+)[[:space:]]*(m|metres)',5,"maximum")) ## "height can range between 2 and 25 metres", "tree ranging between 10 and 20 metres"
 htre=rbind(htre,list('(grows to|grow to|growing to|grow)[[:space:]]+between[[:space:]]*[\\.[:digit:]]+[[:space:]]*[(m|metres)]?[[:space:]]*(–|and|-)[[:space:]]*([\\.[:digit:]]+)[[:space:]]*(m|metres)',4,"maximum"))
 htre=rbind(htre,list('(occasionally reaching)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m|metres)',3,"exceptional maximum")) ## e.g. "occasionally reaching 50 m"

 ## a function to apply those regular expressions
 extract_heights=function(x){
    x=x$simpleProperties$value
    x=str_replace_all(x,"\\([[:digit:][:space:]–-]+(in|inches|ft|feet)\\)","") ## some descriptions give heights in metres and then repeat in imperial units - remove these
    out=data.frame()
    processed_x=x
    for (k in 1:nrow(htre)) { ## iterate through each regular expression in turn
        this_match=str_match_all(processed_x,ignore.case(htre[k,]$re))
        this_match=ldply(this_match,function(z)if (is.null(nrow(z))) NULL else data.frame(type=htre[k,]$type,matched=z[,1],value=as.numeric(z[,htre[k,]$col_index])))
        if (nrow(this_match)>0) {
            out=rbind(out,this_match)
        }
        processed_x=str_replace_all(processed_x,ignore.case(htre[k,]$re),"") ## remove matches as we go, so that we don't match multiple times on the same text
    }
    out
 }


 ## download our sites-by-species matrix, using a 0.5-degree grid cell size
 ss=sites_by_species("genus:Eucalyptus",wkt="POLYGON((110 -45,155 -45,155 -10,110 -10,110 -45))",gridsize=0.5)
 ## convert coordinates to cell centres
 ss$longitude=ss$longitude+0.25
 ss$latitude=ss$latitude+0.25

 all_guids=attr(ss,"guid") ## the identifier of each eucalypt taxon in the ss dataframe

 guid_heights=rep(NA,length(all_guids))
 for (k in 1:length(all_guids)) {
    ## iterate through each identifier and extract the mean tree height
    x=species_info(guid=all_guids[k])
    temp=extract_heights(x)
    guid_heights[k]=mean(temp$value[temp$type=="maximum"])
 }

 ## now, for each column in our sites-by-species matrix, calculate the average height of the species present, weighted by their prevalences
 cellheight=apply(ss[,3:ncol(ss)],1,function(z)sum(z*guid_heights,na.rm=TRUE)/sum(z,na.rm=TRUE))
 ## there are a few cells that lie over the ocean (errors in record location) --- almost all of these have only a single species, so we'll remove any cell with only one species (this also removes some cells over land)
 celln=rowSums(ss[,3:ncol(ss)]>0)
 cellheight[celln<2]=NA

 ## plot the result
 (ggplot(data.frame(lon=ss$longitude,lat=ss$latitude,height=cellheight),aes(x=lon,y=lat))+geom_tile(mapping=aes(fill=height))+scale_fill_gradientn(limit=c(0,50),na.value="transparent",colours=c("#709959","#A3C77D","#D6ED9A","#F0E48D","#E6BA85","#C9957F","#D6AD94","#F5E1C4","#FAF1E3")))


 ## Part 2: exploring the patterns

 ## pick some environmental layers. See ala_fields("layers") for a full list
 env_layers=c("Precipitation - annual","Radiation - annual mean (Bio20)","Elevation ","Temperature - annual max mean","Wind speed - annual mean 9am or 3pm")
 working_data=na.omit(xh)
 ## get the value of each of those environmental layers at each of our sample points
 ep=intersect_points(working_data[,c("lat","lon")],env_layers)
 working_data=na.omit(cbind(working_data,ep))

 ## fit a random forest regression model
 library(randomForest)
 fit=randomForest(height~precipitationAnnual+radiationAnnualMeanBio20+elevation+temperatureAnnualMaxMean+windSpeedAnnualMean9amOr3pm,data=working_data,importance=TRUE)

 ## plot the partial effects
 imp=importance(fit)
 op=par(mfrow=c(3,2))
 for (i in sort(imp[,1],index.return=TRUE,decreasing=TRUE)$ix) {
    thisvar=rownames(imp)[i]
    partialPlot(fit,working_data,names(working_data)[names(working_data)==thisvar],xlab=sprintf("%s (importance %.1f)",thisvar,imp[i,1]),main="")
 }
 par(op)

 ## use the model to predict across Australia - first construct a lon/lat grid
 grid_data=expand.grid(lon=seq(110,155,by=0.2),lat=seq(-45,-10,by=0.1))
 grid_data=cbind(grid_data,intersect_points(grid_data[,c("lat","lon")],env_layers)) ## look up environmental data
 grid_data$height=predict(fit,newdata=grid_data) ## use model to predict height

 (ggplot(grid_data,aes(x=lon,y=lat))+geom_tile(mapping=aes(fill=height))+scale_fill_gradientn(limit=c(0,50),na.value="transparent",colours=c("#709959","#A3C77D","#D6ED9A","#F0E48D","#E6BA85","#C9957F","#D6AD94","#F5E1C4","#FAF1E3")))
	## see http://untan.gl/articles/2014/10/08_using-ala4r-mapping-eucalypt-height.html

	## load the ALA4R package
	library(ALA4R)
	## check that it's version 1.04 or later: the "guid" attribute on the sites_by_species data was added in that version
	if (as.numeric(packageDescription("ALA4R",fields="Version"))<1.04)
	stop("reinstall ALA4R to get v1.04 or later")

	ala_config(cache_directory=file.path("~","data","ala_cache"))

	## some other packages that we'll use
	library(stringr)
	library(plyr)
	library(ggplot2)

	## define the text searches (regular expressions) that we use to extract height information from a species profile
	htre=data.frame(re=c('(growing to\|grows to\|heights of up to)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m\|metres)'),col_index=c(3),type=c("maximum"),stringsAsFactors=FALSE)
	htre=rbind(htre,list('(tree\|shrub)[[:punct:][:space:]]+(up to\|to)[[:space:]]([\\.[:digit:]]+)[[:space:]](m\|metres)',4,"maximum"))
	htre=rbind(htre,list('(tree\|trees\|shrub\|shrubs\|grows\|grow\|growing)[[:space:]]+to[[:space:]]+(roughly\|about\|around\|over)[[:space:]]([\\.[:digit:]]+)[[:space:]](m\|metres)',4,"maximum")) ## text like "tree to about 15 metres, grows to about 10m"
	htre=rbind(htre,list('(tree\|trees\|shrub\|shrubs)[[:space:]]+(about\|around\|roughly\|over)[[:space:]]([\\.[:digit:]]+)[[:space:]](m\|metres)[[:space:]]+(high\|tall\|in height)',4,"maximum")) ## text like "tree about 4 m high"
	htre=rbind(htre,list('(to\|may reach\|can reach\|up to\|reaching\|shrub\|tree)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m\|metres)[[:space:]]+(high\|tall\|in height)',3,"maximum"))
	htre=rbind(htre,list('(to\|may reach\|can reach\|up to\|reaching\|shrub\|tree)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m\|metres)[[:space:]]+or more (high\|tall\|in height)',3,"maximum")) ## e.g. "reaching 80m or more in height"
	htre=rbind(htre,list('(to\|may reach\|can reach\|up to\|reaching\|growing to\|grows to)[[:space:]]+a[[:space:]]+height[[:space:]]+of[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m\|metres)',3,"maximum"))
	htre=rbind(htre,list('(to\|may reach\|can reach\|up to\|reaching\|growing to\|grows to)[[:space:]]+a[[:space:]]+maximum[[:space:]]+height[[:space:]]+of[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m\|metres)',3,"maximum"))
	htre=rbind(htre,list('(to\|may reach\|can reach\|up to\|reaching\|growing to\|shrub\|shrubs\|tree\|trees\|typically\|generally\|usually\|commonly\|about)[[:space:][:punct:]]+[\\.[:digit:]]+[[:space:]][–-][[:space:]]([\\.[:digit:]]+)[[:space:]]*(m\|metres)[[:space:]]+(high\|tall\|in height)',3,"maximum"))
	htre=rbind(htre,list('(to\|may reach\|can reach\|reaches\|up to\|reaching\|growing to\|shrub\|shrubs\|tree\|trees)[[:space:][:punct:]]+(typically\|generally\|usually\|commonly\|about\|of about)?[[:space:]][\\.[:digit:]]+[[:space:]][–-][[:space:]]([\\.[:digit:]]+)[[:space:]](m\|metres)[[:space:]]+(high\|tall\|in height)',4,"maximum")) ## e.g. "shrub, typically 2-3 metres high", "tree of about 6-9 metres in height", "reaches about 5-7 metres in height"
	htre=rbind(htre,list('(height\|shrub\|tree\|shrubs\|trees)[[:space:][:punct:]]+(can range\|ranging\|mostly seen)[[:space:]]+between[[:space:]][\\.[:digit:]]+[[:space:]](–\|and\|-)[[:space:]]([\\.[:digit:]]+)[[:space:]](m\|metres)',5,"maximum")) ## "height can range between 2 and 25 metres", "tree ranging between 10 and 20 metres"
	htre=rbind(htre,list('(grows to\|grow to\|growing to\|grow)[[:space:]]+between[[:space:]][\\.[:digit:]]+[[:space:]][(m\|metres)]?[[:space:]](–\|and\|-)[[:space:]]([\\.[:digit:]]+)[[:space:]]*(m\|metres)',4,"maximum"))
	htre=rbind(htre,list('(occasionally reaching)[[:space:]]+([\\.[:digit:]]+)[[:space:]]*(m\|metres)',3,"exceptional maximum")) ## e.g. "occasionally reaching 50 m"

	## a function to apply those regular expressions
	extract_heights=function(x){
	x=x$simpleProperties$value
	x=str_replace_all(x,"\\([[:digit:][:space:]–-]+(in\|inches\|ft\|feet)\\)","") ## some descriptions give heights in metres and then repeat in imperial units - remove these
	out=data.frame()
	processed_x=x
	for (k in 1:nrow(htre)) { ## iterate through each regular expression in turn
	this_match=str_match_all(processed_x,ignore.case(htre[k,]$re))
	this_match=ldply(this_match,function(z)if (is.null(nrow(z))) NULL else data.frame(type=htre[k,]$type,matched=z[,1],value=as.numeric(z[,htre[k,]$col_index])))
	if (nrow(this_match)>0) {
	out=rbind(out,this_match)
	}
	processed_x=str_replace_all(processed_x,ignore.case(htre[k,]$re),"") ## remove matches as we go, so that we don't match multiple times on the same text
	}
	out
	}


	## download our sites-by-species matrix, using a 0.5-degree grid cell size
	ss=sites_by_species("genus:Eucalyptus",wkt="POLYGON((110 -45,155 -45,155 -10,110 -10,110 -45))",gridsize=0.5)
	## convert coordinates to cell centres
	ss$longitude=ss$longitude+0.25
	ss$latitude=ss$latitude+0.25

	all_guids=attr(ss,"guid") ## the identifier of each eucalypt taxon in the ss dataframe

	guid_heights=rep(NA,length(all_guids))
	for (k in 1:length(all_guids)) {
	## iterate through each identifier and extract the mean tree height
	x=species_info(guid=all_guids[k])
	temp=extract_heights(x)
	guid_heights[k]=mean(temp$value[temp$type=="maximum"])
	}

	## now, for each column in our sites-by-species matrix, calculate the average height of the species present, weighted by their prevalences
	cellheight=apply(ss[,3:ncol(ss)],1,function(z)sum(z*guid_heights,na.rm=TRUE)/sum(z,na.rm=TRUE))
	## there are a few cells that lie over the ocean (errors in record location) --- almost all of these have only a single species, so we'll remove any cell with only one species (this also removes some cells over land)
	celln=rowSums(ss[,3:ncol(ss)]>0)
	cellheight[celln<2]=NA

	## plot the result
	(ggplot(data.frame(lon=ss$longitude,lat=ss$latitude,height=cellheight),aes(x=lon,y=lat))+geom_tile(mapping=aes(fill=height))+scale_fill_gradientn(limit=c(0,50),na.value="transparent",colours=c("#709959","#A3C77D","#D6ED9A","#F0E48D","#E6BA85","#C9957F","#D6AD94","#F5E1C4","#FAF1E3")))


	## Part 2: exploring the patterns

	## pick some environmental layers. See ala_fields("layers") for a full list
	env_layers=c("Precipitation - annual","Radiation - annual mean (Bio20)","Elevation ","Temperature - annual max mean","Wind speed - annual mean 9am or 3pm")
	working_data=na.omit(xh)
	## get the value of each of those environmental layers at each of our sample points
	ep=intersect_points(working_data[,c("lat","lon")],env_layers)
	working_data=na.omit(cbind(working_data,ep))

	## fit a random forest regression model
	library(randomForest)
	fit=randomForest(height~precipitationAnnual+radiationAnnualMeanBio20+elevation+temperatureAnnualMaxMean+windSpeedAnnualMean9amOr3pm,data=working_data,importance=TRUE)

	## plot the partial effects
	imp=importance(fit)
	op=par(mfrow=c(3,2))
	for (i in sort(imp[,1],index.return=TRUE,decreasing=TRUE)$ix) {
	thisvar=rownames(imp)[i]
	partialPlot(fit,working_data,names(working_data)[names(working_data)==thisvar],xlab=sprintf("%s (importance %.1f)",thisvar,imp[i,1]),main="")
	}
	par(op)

	## use the model to predict across Australia - first construct a lon/lat grid
	grid_data=expand.grid(lon=seq(110,155,by=0.2),lat=seq(-45,-10,by=0.1))
	grid_data=cbind(grid_data,intersect_points(grid_data[,c("lat","lon")],env_layers)) ## look up environmental data
	grid_data$height=predict(fit,newdata=grid_data) ## use model to predict height

	(ggplot(grid_data,aes(x=lon,y=lat))+geom_tile(mapping=aes(fill=height))+scale_fill_gradientn(limit=c(0,50),na.value="transparent",colours=c("#709959","#A3C77D","#D6ED9A","#F0E48D","#E6BA85","#C9957F","#D6AD94","#F5E1C4","#FAF1E3")))