roblanf · August 8, 2023 16:38 · mbutler808 · Mar 30, 2023 · mbutler808 · Mar 30, 2023
diff --git a/concordance.R b/concordance.R
 library(viridis)
 library(ggplot2)
 library(dplyr)
 library(ggrepel)
 library(GGally)
 library(entropy)

 # read the data
 d = read.delim("concord.cf.stat", header = T, comment.char=‘#')
               
 names(d)[10] = "bootstrap"
 names(d)[11] = "branchlength"


 # plot the values
 ggplot(d, aes(x = gCF, y = sCF)) + 
    geom_point(aes(colour = bootstrap)) + 
    scale_colour_viridis(direction = -1) + 
    xlim(0, 100) +
    ylim(0, 100) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed")

 # label the points
 ggplot(d, aes(x = gCF, y = sCF, label = ID)) + 
    geom_point(aes(colour = bootstrap)) + 
    scale_colour_viridis(direction = -1) + 
    xlim(0, 100) +
    ylim(0, 100) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    geom_text_repel()


 # find a branch of interest
 d[d$ID==327,]



 # test ILS assumptions

 # first we use a slightly modified chisq function
 # which behaves nicely when you feed it zeros
 chisq = function(DF1, DF2, N){
    tryCatch({
        # converts percentages to counts, runs chisq, gets pvalue
        chisq.test(c(round(DF1*N)/100, round(DF2*N)/100))$p.value
    },
    error = function(err) {
        # errors come if you give chisq two zeros
        # but here we're sure that there's no difference
        return(1.0)
    })
 }

 e = d %>% 
    group_by(ID) %>%
    mutate(gEF_p = chisq(gDF1, gDF2, gN)) %>%
    mutate(sEF_p = chisq(sDF1, sDF2, sN))
    

 subset(data.frame(e), (gEF_p < 0.05 | sEF_p < 0.05))



 # calculate internode certainty

 IC = function(CF, DF1, DF2, N){
    
    # convert to counts
    X = CF * N / 100
    Y = max(DF1, DF2) * N / 100
        
    pX = X/(X+Y)
    pY = Y/(X+Y)
    
    IC = 1 + pX * log2(pX) +
             pY * log2(pY)

    return(IC)
 }


 e = e %>% 
    group_by(ID) %>%
    mutate(gIC = IC(gCF, gDF1, gDF2, gN)) %>%
    mutate(sIC = IC(sCF, sDF1, sDF2, sN))


 # entropy

 ENT = function(CF, DF1, DF2, N){
    
    CF = CF * N / 100
    DF1 = DF1 * N / 100
    DF2 = DF2 * N / 100
    
    return(entropy(c(CF, DF1, DF2)))

 }

 ENTC = function(CF, DF1, DF2, N){
    
    maxent = 1.098612
    
    CF = CF * N / 100
    DF1 = DF1 * N / 100
    DF2 = DF2 * N / 100
    
    ent = entropy(c(CF, DF1, DF2))
    
    entc = 1 - (ent / maxent)
    
    return(entc)
    
 }

 e = e %>% 
    group_by(ID) %>%
    mutate(sENT = ENT(sCF, sDF1, sDF2, sN)) %>%
    mutate(sENTC = ENTC(sCF, sDF1, sDF2, sN))
    
 head(e)

 # plot it
 ggpairs(e, columns = c(2, 6, 10, 12, 13, 14, 15, 16, 17))
	library(viridis)
	library(ggplot2)
	library(dplyr)
	library(ggrepel)
	library(GGally)
	library(entropy)

	# read the data
	d = read.delim("concord.cf.stat", header = T, comment.char=‘#')

	names(d)[10] = "bootstrap"
	names(d)[11] = "branchlength"


	# plot the values
	ggplot(d, aes(x = gCF, y = sCF)) +
	geom_point(aes(colour = bootstrap)) +
	scale_colour_viridis(direction = -1) +
	xlim(0, 100) +
	ylim(0, 100) +
	geom_abline(slope = 1, intercept = 0, linetype = "dashed")

	# label the points
	ggplot(d, aes(x = gCF, y = sCF, label = ID)) +
	geom_point(aes(colour = bootstrap)) +
	scale_colour_viridis(direction = -1) +
	xlim(0, 100) +
	ylim(0, 100) +
	geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
	geom_text_repel()


	# find a branch of interest
	d[d$ID==327,]



	# test ILS assumptions

	# first we use a slightly modified chisq function
	# which behaves nicely when you feed it zeros
	chisq = function(DF1, DF2, N){
	tryCatch({
	# converts percentages to counts, runs chisq, gets pvalue
	chisq.test(c(round(DF1N)/100, round(DF2N)/100))$p.value
	},
	error = function(err) {
	# errors come if you give chisq two zeros
	# but here we're sure that there's no difference
	return(1.0)
	})
	}

	e = d %>%
	group_by(ID) %>%
	mutate(gEF_p = chisq(gDF1, gDF2, gN)) %>%
	mutate(sEF_p = chisq(sDF1, sDF2, sN))


	subset(data.frame(e), (gEF_p < 0.05 \| sEF_p < 0.05))



	# calculate internode certainty

	IC = function(CF, DF1, DF2, N){

	# convert to counts
	X = CF * N / 100
	Y = max(DF1, DF2) * N / 100

	pX = X/(X+Y)
	pY = Y/(X+Y)

	IC = 1 + pX * log2(pX) +
	pY * log2(pY)

	return(IC)
	}


	e = e %>%
	group_by(ID) %>%
	mutate(gIC = IC(gCF, gDF1, gDF2, gN)) %>%
	mutate(sIC = IC(sCF, sDF1, sDF2, sN))


	# entropy

	ENT = function(CF, DF1, DF2, N){

	CF = CF * N / 100
	DF1 = DF1 * N / 100
	DF2 = DF2 * N / 100

	return(entropy(c(CF, DF1, DF2)))

	}

	ENTC = function(CF, DF1, DF2, N){

	maxent = 1.098612

	CF = CF * N / 100
	DF1 = DF1 * N / 100
	DF2 = DF2 * N / 100

	ent = entropy(c(CF, DF1, DF2))

	entc = 1 - (ent / maxent)

	return(entc)

	}

	e = e %>%
	group_by(ID) %>%
	mutate(sENT = ENT(sCF, sDF1, sDF2, sN)) %>%
	mutate(sENTC = ENTC(sCF, sDF1, sDF2, sN))

	head(e)

	# plot it
	ggpairs(e, columns = c(2, 6, 10, 12, 13, 14, 15, 16, 17))