Last active
September 30, 2021 02:51
-
-
Save bhoung/5596282 to your computer and use it in GitHub Desktop.
Example 2. Running trueskill algorithm on a tennis tournament.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This second example runs Trueskill on a tennis tournament, the Australian Open. | |
# Note that actual computation is commented out as it takes about ~40 seconds to | |
# update skill ratings over 127 matches. | |
library(trueskill) | |
# Data format of ausopen2012 is: Player, Opponent, Margin, Round, WRank, LRank | |
data("ausopen2012") | |
# create match_id in order to reshape | |
data$match_id <- row.names(data) | |
# reshape wide to long on match_id such that we have | |
# 2 rows per match, 1 with Player1 as Player and 1 with | |
# Player2 as Opponent and vice versa. | |
data <- data[c("Winner", "Loser", "Round", "WRank", "LRank")] | |
data <- reshape(data, | |
idvar = "match_id", | |
varying = list(c(1, 2), c(2, 1), c(4, 5), c(5,4)), | |
v.names = c("Player", "Opponent", "WRank", "LRank"), | |
new.row.names = 1:1000, | |
timevar = "t", | |
direction = "long") | |
# data comes preformatted with winner in Player column | |
# set margin to 1 for win and -1 for loss. | |
data$margin[data$t == "1"] <- 1 | |
data$margin[data$t != "1"] <- -1 | |
data$t <- NULL | |
data$mu1 <- NA | |
data$sigma1 <- NA | |
data$mu2 <- NA | |
data$sigma2 <- NA | |
# For the first round, set Mu to 300 less the ATP rank | |
# Skill tends to be stable at the higher rankings (descending from 1), | |
# so set sigma at mu less mu / 3, rather than the recommended mu / 3 | |
data[c("mu1","sigma1")] <- c(300 - data$WRank, | |
round(300 - data$WRank - ((300 - data$WRank) / 3), 1)) | |
data[c("mu2","sigma2")] <- c(300 - data$LRank, | |
round(300 - data$LRank - ((300 - data$WRank) / 3), 1)) | |
data[!data$Round == "1st Round",][c("mu1","sigma1")] <- c(NA, NA) | |
data[!data$Round == "1st Round",][c("mu2","sigma2")] <- c(NA, NA) | |
parameters <- Parameters() | |
# Trueskill expects data with columns mu1, sigma1, mu2 and sigma2, | |
# will set mu and sigma to 25 and 25 / 3 if NA. | |
# data <- Trueskill(data, parameters) | |
# top4 <- subset(data, Player == "Djokovic N." | Player == "Nadal R." | | |
# Player == "Federer R." | Player == "Murray A." ) | |
# top4 <- top4[order(top4$Player,top4$Round),] | |
# subset(top4, Player == "Djokovic N.") | |
# For a visualisation, load up our favourite package ggplot2... | |
# library(ggplot2) | |
# g1 <- ggplot(top4, aes(x = Round, y = mu1, group = Player, colour = Player)) + | |
# geom_point(aes(colour=factor(Player))) + geom_line(aes()) | |
# g1 | |
# Without having adjusted the input parameters, Trueskill does not predict | |
# match outcomes well, as it appears that facing stiffer opposition | |
# (higher skilled players) tends to diminish a player's chances of | |
# progressing in the subsequent round. | |
# This is consistent with commentators describing players with softer draws and | |
# playing shorter matches (3 sets as opposed to 5 sets) as being | |
# fresher in later rounds. | |
# The other feature is that the skill of the better players is weighted | |
# towards the losing player even if the better player wins, so we have | |
# this effect of the 4 semifinalists having their skills dropping as the | |
# tournament progresses. This could be symptomatic of high starting values, | |
# which is necessary due to some of the very low rankings. | |
# E.g Lleyton Hewitt with 181. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment