Last active
August 8, 2023 16:38
-
-
Save roblanf/992ed3b37eca757ce91b9ed5da15554e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(viridis) | |
library(ggplot2) | |
library(dplyr) | |
library(ggrepel) | |
library(GGally) | |
library(entropy) | |
# read the data | |
d = read.delim("concord.cf.stat", header = T, comment.char=‘#') | |
names(d)[10] = "bootstrap" | |
names(d)[11] = "branchlength" | |
# plot the values | |
ggplot(d, aes(x = gCF, y = sCF)) + | |
geom_point(aes(colour = bootstrap)) + | |
scale_colour_viridis(direction = -1) + | |
xlim(0, 100) + | |
ylim(0, 100) + | |
geom_abline(slope = 1, intercept = 0, linetype = "dashed") | |
# label the points | |
ggplot(d, aes(x = gCF, y = sCF, label = ID)) + | |
geom_point(aes(colour = bootstrap)) + | |
scale_colour_viridis(direction = -1) + | |
xlim(0, 100) + | |
ylim(0, 100) + | |
geom_abline(slope = 1, intercept = 0, linetype = "dashed") + | |
geom_text_repel() | |
# find a branch of interest | |
d[d$ID==327,] | |
# test ILS assumptions | |
# first we use a slightly modified chisq function | |
# which behaves nicely when you feed it zeros | |
chisq = function(DF1, DF2, N){ | |
tryCatch({ | |
# converts percentages to counts, runs chisq, gets pvalue | |
chisq.test(c(round(DF1*N)/100, round(DF2*N)/100))$p.value | |
}, | |
error = function(err) { | |
# errors come if you give chisq two zeros | |
# but here we're sure that there's no difference | |
return(1.0) | |
}) | |
} | |
e = d %>% | |
group_by(ID) %>% | |
mutate(gEF_p = chisq(gDF1, gDF2, gN)) %>% | |
mutate(sEF_p = chisq(sDF1, sDF2, sN)) | |
subset(data.frame(e), (gEF_p < 0.05 | sEF_p < 0.05)) | |
# calculate internode certainty | |
IC = function(CF, DF1, DF2, N){ | |
# convert to counts | |
X = CF * N / 100 | |
Y = max(DF1, DF2) * N / 100 | |
pX = X/(X+Y) | |
pY = Y/(X+Y) | |
IC = 1 + pX * log2(pX) + | |
pY * log2(pY) | |
return(IC) | |
} | |
e = e %>% | |
group_by(ID) %>% | |
mutate(gIC = IC(gCF, gDF1, gDF2, gN)) %>% | |
mutate(sIC = IC(sCF, sDF1, sDF2, sN)) | |
# entropy | |
ENT = function(CF, DF1, DF2, N){ | |
CF = CF * N / 100 | |
DF1 = DF1 * N / 100 | |
DF2 = DF2 * N / 100 | |
return(entropy(c(CF, DF1, DF2))) | |
} | |
ENTC = function(CF, DF1, DF2, N){ | |
maxent = 1.098612 | |
CF = CF * N / 100 | |
DF1 = DF1 * N / 100 | |
DF2 = DF2 * N / 100 | |
ent = entropy(c(CF, DF1, DF2)) | |
entc = 1 - (ent / maxent) | |
return(entc) | |
} | |
e = e %>% | |
group_by(ID) %>% | |
mutate(sENT = ENT(sCF, sDF1, sDF2, sN)) %>% | |
mutate(sENTC = ENTC(sCF, sDF1, sDF2, sN)) | |
head(e) | |
# plot it | |
ggpairs(e, columns = c(2, 6, 10, 12, 13, 14, 15, 16, 17)) | |
Same issue with line 124, I think the column numbers have changed. I would suggest:
ggpairs(e, columns = c("gCF", "sCF", "bootstrap", "gEF_p", "sEF_p", "gIC", "sIC", "sENT", "sENTC"))
Thank you for this really terrific script, and even more clarifying blog post! Thank you, Marguerite
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For lines 11, 12: Columns 10 and 11 point to gN and sCF, I think you mean 18 and 19, but to be safe, why not: