Tools For Evo Devo
``````##############################################################
# =========================================================
# An Evo-Devo Analysis in R: an example with cone snails
# =========================================================
# by Nick Matzke (and whoever else adds to this PhyloWiki page)
# matzkeATberkeley.edu
# March 2011
#
# Please link/cite if you use this, email me if you have
#   thoughts/improvements/corrections.
#
##############################################################
#
# Copyright: this lab contains/uses unpublished research data &
# analyses.
#
# IB200b students are of course free to re-use the code, others should
# contact me about re-use of the code or data.
#
# This work was done in collaboration with Monty Slatkin, George
# Oster, and Neil Gong.
#
###################################################################
#
# Assignment for Tuesday, March 8
#
# In this lab, I will take you through a real analysis I am
# working on that is basically applying ancestral character
# reconstruction methods and model selection to an evo-devo
# question.
#
# George Oster's lab (Oster is a biophysicist) developed a shell-
# growth model for cone snails that runs in Matlab.  (Google cone
# snails to look at some shells.)
#
# Basically, given a series of 24 input parameters, the model grows
# a shell, complete with the pattern, which might be stripes, dots,
# triangles, blank, etc.  The first 19 parameters control various
# signal and response curves that cells of the cone snail mantle,
# under the model issue and receive, which then control the release
# of pigment.
#
# The last 5 parameters deal with shell shape:
#
# kz, kr, and turn are from Raup's famous shell-shape model, and
# control the # of turns, the angle, etc. of the shell
#
# N and T are starting conditions for the model (# of cells at the
# growing edge, and the time point they start at).
#
#
#
# Once they built the model, Oster's lab then took a bunch of living
# species, and figured out the parameters that grow patterns to match
# each of these species.
#
# Oster then came over to Integrative Biology to "do some evolution
# with this".  Some obvious questions are:
#
# 1. Is there any phylogenetic signal in these parameters, which
# after all are not direct measurements, but parameters that were
# fit to a computer model by eye.
#
# 1a. Are the parameters statistically independent, or highly
# correlated with each other?
#
# 2. Is Brownian motion a reasonable model under which to
# conduct ancestral state reconstruction?
#
# 3. What model of evolution best describes each parameter?
# Available models are:
# - Brownian motion (BM);
#
# - the Ornstein−Uhlenbeck (OU) stabilizing selection model;
#
# - the lambda model which rescales internal branch lengths by
#   a linear fraction;
#
# - the kappa model (green) which rescales each branch length by a power
#   equal to the kappa parameter, and which becomes a speciational model
#   as kappa approaches 0;
#
# - delta (yellow) which focuses change towards the base or tips;
#
# - early burst (EB, cyan) which has an initial high rate of change that
#   then declines;
#
# - and white noise (white), where observations are produced by a normal
#   distribution with no tree structure, which represents the situation
#   of no phylogenetic signal.
#
# See references under the function fitContinuous.
#
# Assignment:
#
# Work through this code, and try to understand what each
# stage does.
#
# Have a look at the final Likelihood Ratio Test results, and the model
# comparison figure which uses AIC, and email me your verbal summary of
# those results.  E.g., which model is best, how confidence should the
# reader be in those conclusions, and what are the implications in
# terms of understanding the evolutionary processes involved,
# and the likely accuracy of using Brownian motion for ancestral
# character reconstruction.
#
#
# Start by looking at the graphical file Neil_tree.png on the website.
# This has photos of the cone snail species, and the matching shells
# grown in the computer, on a DNA phylogeny.
#
# The parameters are stored in parameters_njm7.txt, which this script
# takes as input, along with the DNA tree, tr.newick, and the
# ultrametricized version, tr2.newick.
#
# Just to cut to the end, The final results figure is at:
# tree_DNA_num_ML_inv2.png
#
###################################################################

# Stuff that is handy at the beginning of a script
######################################################
# R, close all devices/windows/ quartz windows
graphics.off()

# Turn off all BS stringsAsFactors silliness
options(stringsAsFactors = FALSE)
######################################################

# Working directories:
# One of the first things you want to do, usually, is
# decide on your working directory

# On my Mac, this is the directory
wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab10"

# On a PC, you might have to specify paths like this:
#wd = "c:\\Users\\nick\\Desktop\\_ib200a\\ib200b_sp2011\\lab10"

#desc: set working directory
setwd(wd)

#desc: get working directory
getwd()

#desc: list the files in the directory
list.files()

# Sourcing scripts
# Local sourcing:
# source a script (windows machines require \\ instead of /)
# sourcedir = '/_njm/'
#source3 = '_genericR_v1.R'
#source(paste(sourcedir, source3, sep=""))

# You need the APE package to source genericR_v1.R scripts
install.packages("ape")
library(ape)

# Remote sourcing:
source("http://ib.berkeley.edu/courses/ib200b/scripts/_genericR_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_conus_analyses_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_R_tree_functions_v1.R")

# Setup:
# install these, if needed
library(ape)
library(phangorn)      # helps with "midpoint" function
library(picante)      # for "Kcalc" and "phylosignal", i.e. calculation of the kappa statistic for continuous parameters
library(geiger)     # for e.g. fitContinuous

#########################################
# Plot the original DNA tree
#########################################
trfn = "tr.newick"
plot(tr)
title("Original DNA tree (digitized from Nam et al. 2009\nusing TreeRogue 0.1)\n(Note: axis scale is in digitization coordinates)")
axisPhylo()

# Get the parameters file

# identify the types of params
uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia", "i1s1g", "i1s2eg", "i1s2ig", "i1s3g", "i1sf2ia", "i2s1g", "i2s2eg", "i2s2ig", "i2s3g", "i2sf2ia")
continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2")
shell_params_names = c("kz", "kr", "turn")
starting_params_names = c("N", "T")
complex_params_names = c("s3n_std", "s3t_spd")
clad_chars_names = c("stripes", "triangles", "dots", "color", "conical", "food")
complex1_names = names(params_orig)[(ncol(params_orig)-25*2) : (ncol(params_orig)-26)]
complex2_names = names(params_orig)[(ncol(params_orig)-25) : ncol(params_orig)]
nonsucky_names = c(continuous_params_names, shell_params_names, starting_params_names, complex_params_names, clad_chars_names)
all_cont_names = c(continuous_params_names, shell_params_names, starting_params_names)

# remove the uniform params
cols_tokeep = names(params_orig) %in% uniform1_params_names == FALSE

all_params = params_orig[, cols_tokeep]
row.names(all_params) = all_params\$species
(all_params)

###################################
# Drop the tips we're not using
###################################
all_params_orig = all_params
#n1 = xy3\$nodenames[xy3\$nodenames != ""]
n1 = tr\$tip.label
n2 = all_params\$species
n1_to_keep = n1 %in% n2
print(cbind(n1, n1_to_keep))

# drop the tips we're not using
tr2 = drop.tip(tr, n1[n1_to_keep == FALSE])

####################################################################
# reorder the parameters to match the order of tips in the tree
# (this is CRUCIAL!!)
####################################################################
# tree tip order, from bottom to top:
(labels1 = tr2\$tip.label)

# order of rows in char_dtf
(labels2 = row.names(all_params))

# reorder char_dtf to match tip.labels
new_order = match(labels1, labels2)
all_params = all_params_orig[new_order, ]

params = all_params

# Cut params down to just continuous parameters for correlation analysis
params = params[, all_cont_names]

####################################

#pairs(params[, -1])
cps = cor(params)
cps = as.data.frame(cps)
names_cps = names(cps)
#dim(params[, -1])

# Rank pairwise correlations
pairwise_cors_list = c()
for (i in 2:nrow(cps))
{
if ((i+1) < ncol(cps))
{
for (j in (i+1):ncol(cps))
{
tmprow = c(names_cps[i], names_cps[j], cps[i,j], abs(cps[i, j]))
pairwise_cors_list = rbind(pairwise_cors_list, tmprow)
}
}
}
row.names(pairwise_cors_list) = NULL
pairwise_cors_list = as.data.frame(pairwise_cors_list)
names(pairwise_cors_list) = c("x", "y", "cor", "abs_cor")
#rank(pairwise_cors_list\$abs_cor)
order_cors = order(pairwise_cors_list\$abs_cor, decreasing=TRUE)

pairwise_cors_list = pairwise_cors_list[order_cors, ]

print("Top correlations:")
(pairwise_cors_list[1:20, ])

######################################################
# Phylogenetically Independent Contrasts
######################################################
# we are going to use an ultrametric tree:
#chronogram(tr2, scale = 1, expo = 2, minEdgeLength = 1e-06)
write.tree(chtr2, file="chtr2.newick")
plot(chtr2)
title("DNA tree, ultrametricized with APE's chronogram() function: NPRS\n(nonparametric rate smoothing), basically a least squares fit after Sanderson 1997")
axisPhylo()

pic_params = c()
for (i in 1:ncol(params))
{
x = pic(params[,i], chtr2)
pic_params = cbind(pic_params, x)
}

pic_params = as.data.frame(pic_params)
names(pic_params) = names(params)

#pairs(pic_params)
cor(pic_params)

# Plot all params vs. all params
tmpparams = params[, all_cont_names]
tmpparams = dfnums_to_numeric(tmpparams)
tmpparams[tmpparams == "?"] = NA
pairs.panels(tmpparams, smooth=FALSE)
title("Regular correlations")
pairs.panels(pic_params, smooth=FALSE)
title("PIC correlations")

# Assemble the tables of correlations for params and PICs of params
res1_txt_table = c()
respic_txt_table = c()
for (i in 1:ncol(params))
{
for (j in i:ncol(params))
{
if (i == j)
{
next()
}
cat("x=", names(params)[i], "        vs.        y=", names(params)[j], "\n", sep="")

res1 = cor.test(params[, i], params[, j])
respic = cor.test(pic_params[, i], pic_params[, j])
#print()
#print()

lm_res1 = lm(params[, j] ~ params[, i])
lm_respic = lm(pic_params[, j] ~ pic_params[, i])

res1_txt = c(names(params)[i], names(params)[j], res1\$statistic, res1\$parameter, res1\$p.value, res1\$estimate, res1\$null.value, res1\$alternative, res1\$method, res1\$data.name, res1\$conf.int, res1\$conf.int, lm_res1\$coefficients, lm_res1\$coefficients)

respic_txt = c(names(params)[i], names(params)[j], respic\$statistic, respic\$parameter, respic\$p.value, respic\$estimate, respic\$null.value, respic\$alternative, respic\$method, respic\$data.name, respic\$conf.int, respic\$conf.int, lm_respic\$coefficients, lm_respic\$coefficients)

res1_txt_table = rbind(res1_txt_table, res1_txt)
respic_txt_table = rbind(respic_txt_table, respic_txt)
}
}

res1_txt_table = as.data.frame(res1_txt_table, row.names=1:nrow(res1_txt_table))
names(res1_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept"))

respic_txt_table = as.data.frame(respic_txt_table, row.names=1:nrow(respic_txt_table))
names(respic_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept"))

res1_txt_table = dfnums_to_numeric(res1_txt_table)
respic_txt_table = dfnums_to_numeric(respic_txt_table)

res1_txt_table\$pval_bonf = res1_txt_table\$pval * nrow(res1_txt_table)
respic_txt_table\$pval_bonf = respic_txt_table\$pval * nrow(respic_txt_table)

res1_txt_table = res1_txt_table[order(respic_txt_table\$pval_bonf),]
respic_txt_table = respic_txt_table[order(respic_txt_table\$pval_bonf),]

(res1_txt_table[1:20,])
(respic_txt_table[1:20,])

# sort by ascending p-value
res1_txt_table_sort = res1_txt_table[order(res1_txt_table\$pval), ]
respic_txt_table_sort = respic_txt_table[order(respic_txt_table\$pval), ]

(res1_txt_table_sort[1:20,])
(respic_txt_table_sort[1:20,])

x = res1_txt_table\$cor_estimate
y = respic_txt_table\$cor_estimate
xlab = "r (regular)"
ylab="r (PIC)"
xlim = c(-1, 1)
ylim = c(-1, 1)
titlestart_txt = "correlation (r) estimates for pairwise correlations,\nnormal vs. phylogenetically independent contrasts;\n"
basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim)

x = res1_txt_table\$pval
y = respic_txt_table\$pval
xlab="p-val (regular)"
ylab="p-val (PIC)"
xlim = c(0, 1)
ylim = c(0, 1)
titlestart_txt = "p-values for pairwise correlations,\nnormal vs. phylogenetically independent contrasts\n"
basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim)

# Write the tables to text
write.table(res1_txt_table, "res1_txt_table.txt")
write.table(respic_txt_table, "respic_txt_table.txt")
############################################################

#####################################################
# Assess the parsimony stats of the discrete characters
#####################################################

char_dtf_orig = all_params[, colnames_to_use]
char_dtf = char_dtf_orig

# Make the final figure comparing the trees

# quite fast
#(parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf)))
(parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf)))
write_table_good(parstats, "parstats_v8.txt")

# slower
#(parstats = parsim_stats(tr2, char_dtf))

# Now, resample (takes a few seconds)
num_resamps = 100
parstats_null = array(data=NA, dim=c(ncol(char_dtf)+1, 6, num_resamps+1))
for (i in 1:num_resamps)
{
tmpsamp = resample_dtf_matrix(char_dtf)
parstats_null[, , i] = as.matrix(parsim_stats_fast(tr2, tmpsamp))
}
parstats_null[, , num_resamps+1] = as.matrix(parstats)

summary_parstats_means = parstats
summary_parstats_sds = parstats
summary_parstats_pvals = parstats

# do histograms of the six different distances
# set outer margins & number of subplots
par(oma=c(1,1,5,1), mfrow=c(2,3))
for (i in 1:nrow(parstats))
{
for (j in 1:ncol(parstats))
{
x = parstats_null[i, j, ]
summary_parstats_means[i, j] = mean(x)
summary_parstats_sds[i, j] = sd(x)

observed_val = parstats_null[i, j, num_resamps+1]
pval = round(empirical_pval_fast(x, observed_val), 4)

# redo pval depending on whether value should be low or high
if (j > 3)
{
pval = round(1-pval, 4)
}

summary_parstats_pvals[i, j] = pval

tmp_title = paste("non-parametric p-value = ", pval, sep="")
h = hist(x, main=tmp_title, breaks=50, cex.main=1, xlab=names(parstats)[j])

arrowtop = 0.3*max(h\$counts, na.rm=TRUE)
arrowbot = 0.03*max(h\$counts, na.rm=TRUE)
arrows(observed_val, arrowtop, observed_val, arrowbot, col="blue", lwd=4)
}
titletxt = paste("Parsimony stats for ", row.names(parstats)[i], sep="")
mtext(titletxt, outer=TRUE)
}

write_table_good(summary_parstats_means, "parstats_means_v8.txt")
write_table_good(summary_parstats_sds, "parstats_sds_v8.txt")
write_table_good(summary_parstats_pvals, "parstats_pvals_v8.txt")

#######################################################
# Ancestral character reconstruction --  continuous, plots
#######################################################

par(mfrow=c(1,2))

# character mapping
# uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia")
# continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2")
# shell_params_names = c("kz", "kr", "turn")
# starting_params_names = c("N", "T")
# complex_params_names = c("s3n_std", "s3t_spd")

# continuous parameter characters
colnames_to_use = c(continuous_params_names, shell_params_names, starting_params_names)
char_dtf_orig = all_params[, colnames_to_use]
char_dtf = char_dtf_orig

#################
# reorder char_dtf
################
# tree tip order, from bottom to top:
#(labels1 = chtr2\$tip.label)
# order of rows in char_dtf
#(labels2 = row.names(char_dtf))
# reorder char_dtf to match tip.labels
#new_order = match(labels1, labels2)
#char_dtf = char_dtf[new_order, ]

par(mfrow=c(6,4))
for (colnum in 1:ncol(char_dtf))
{
hist(as.numeric(char_dtf[,colnum]), 17, main=title(names(char_dtf)[colnum]) )
}

par(mfrow=c(1,1))

###########################################################################
# Compare different models for the evolution of this character
###########################################################################
runthis4 = TRUE
if (runthis4 == TRUE)
{
# models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white", "trend")
# cut "trend" model, doesn't matter for ultrametric trees (same as BM model, evidently)
models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white")
model_fits = NULL

# GET THE VARIOUS PARAMETERS ALSO

# beta (typically the rate)
# alpha (central tendency on BM walk that is proportional to alpha)

# Lambda model
# multiplies all internal branches of the tree by lambda, leaving tip
# branches as their original length
#
# kappa raises all branch lengths to the power kappa. As kappa approaches
# zero, the model becomes speciational.
#
# (apparently, kappa is saved as "lamda" in the kappa model)

# delta (delta>1, evo tipwards, < 1, evo at base)

# EB (a = proportion of initial rate represented by the end rate (?))

# White Noise (draw from mean/variance)
# mean = mean
# nv = variance (?)

# THESE BOUNDS PRODUCE PATHOLOGICAL RESULTS (on my first dataset)
bounds_list = NULL
# bounds_list\$BM = list(beta=c(1,100))
# bounds_list\$OU = list(alpha=c(0.0001, 10), beta=c(1, 100))
# bounds_list\$lambda = list(beta=c(1,100))
# bounds_list\$kappa = list(beta=c(1,100))
# bounds_list\$EB = list(beta=c(1,100))
# bounds_list\$white = NULL

# THESE BOUNDS ARE THE DEFAULTS
bounds_list = NULL

# db = default bounds
#db_beta = c(1e-08, 20)
#db_lambda = c(1e-07, 1)
#db_kappa = c(1e-06, 1)
#db_delta = c(1e-05, 2.999999)
#db_alpha = c(1e-07, 50)
#db_a = c(-3, 0)
#db_nv = c(1e-10, 100)
#db_mu = c(-100, 100)

# YOU SHOULD ALWAYS CHECK IF YOUR ESTIMATES HIT THE BOUNDS, AND
bounds_list = NULL

# db = default bounds
db_beta = c(1e-08, 1e08)
db_alpha = c(1e-07, 50)
db_lambda = c(1e-07, 1)
db_kappa = c(1e-06, 1)
db_delta = c(1e-05, 2.9999)
db_a = c(-3, 0)
db_nv = c(1e-10, 1000000)
#db_mu = c(-100, 100)

bounds_list\$BM = list(beta=db_beta)
bounds_list\$OU = list(beta=db_beta, alpha=db_alpha)
bounds_list\$lambda = list(beta=db_beta, lambda=db_lambda)
bounds_list\$kappa = list(beta=db_beta, lambda=db_kappa)
bounds_list\$EB = list(beta=db_beta, a=db_a)
bounds_list\$white = list(nv=db_nv)
# bounds_list\$model = list(mu=db_mu)

for (i in 1:length(models_list))
{
# Chose the right bounds
cmdstr = paste("tmp_bounds = bounds_list\$", models_list[i], sep="")
eval(parse(text = cmdstr))

model_fit = fitContinuous(chtr2, char_dtf, model=models_list[i], bounds = tmp_bounds)
cmdstr = paste("model_fits\$", models_list[i], " = model_fit", sep="")
eval(parse(text = cmdstr))
}
# Save R object to a file
save(model_fits, file="model_fits.Rdata")
} else {
}

model_fits_orig = model_fits
model_fits = model_fits_orig

# Get log likelihoods and AICs
results_list = c("lnl", "aic", "aicc", "k", "beta", "alpha", "lambda", "delta", "a", "mean", "nv")

# fill in blank results
for (item in results_list)
{
cmdstr = paste(item, "_matrix = matrix(NA, nrow=length(models_list), ncol=length(all_cont_names))", sep="")
eval(parse(text = cmdstr))
}

# put in the data from the model results
for (i in 1:length(models_list))
{
for (j in 1:length(all_cont_names))
{
for (item in results_list)
{
cmdstr = paste("tmp_item = model_fits\$", models_list[i], "\$", all_cont_names[j], "\$", item, sep="")
eval(parse(text = cmdstr))
if (is.null(tmp_item))
{
cmdstr = paste(item, "_matrix[i,j] = NA", sep="")
eval(parse(text = cmdstr))
} else {
cmdstr = paste(item, "_matrix[i,j] = tmp_item", sep="")
eval(parse(text = cmdstr))
}
}
}
}

# Apply the row & column names
for (item in results_list)
{
cmdstr = paste(item, "_matrix = adf(", item, "_matrix)", sep="")
eval(parse(text = cmdstr))

cmdstr = paste("names(", item, "_matrix) = names(char_dtf)", sep="")
eval(parse(text = cmdstr))

cmdstr = paste("row.names(", item, "_matrix) = models_list", sep="")
eval(parse(text = cmdstr))

# write to txt file
cmdstr = paste("write_table_good(", item, "_matrix, '", item, "_matrix.txt')", sep="")
eval(parse(text = cmdstr))
}

lnl_matrix
aic_matrix
aicc_matrix
k_matrix
beta_matrix
alpha_matrix
lambda_matrix
delta_matrix
a_matrix
mean_matrix
nv_matrix

# Compare the models using AIC
# Another way: use AIC
# For each col, subtract the min

diff_aic_matrix = aic_matrix

for (colnum in 1:ncol(aic_matrix))
{
diff_aic_matrix[, colnum] = aic_matrix[, colnum] - min(aic_matrix[, colnum], na.rm=TRUE)
}

cat("Table of difference-from-best-AIC values; 0 = best model\n")
print(diff_aic_matrix, digits=2)

# Citation for this calculation:
# http://www.brianomeara.info/tutorials/aic
#
# or:
#
#
aic_wi = exp(1) ^ (-0.5 * diff_aic_matrix)
colsum_aic_wi = apply(aic_wi, 2, sum, na.rm=TRUE)

# calculate the relative weights
aic_weights = aic_wi
for (colnum in 1:ncol(aic_wi))
{
aic_weights[,colnum] = aic_wi[,colnum] / colsum_aic_wi[colnum]
}

#
cat("These are the AIC weights, calculated according to Burnham & Anderson (2002).\n")
print(aic_weights, digits=2)

# flip aic_weights for graphing
aic_weights[nrow(aic_weights):1, ] = aic_weights
row.names(aic_weights) = row.names(aic_weights)[nrow(aic_weights):1]

#####################################
# Plot AIC weights, with labels
#####################################
# What are the best models under AIC
best_model_list = rownames(aic_weights)

best_models = rep(NA, length(colnames(aic_weights)))
for (colnum in 1:length(colnames(aic_weights)))
{
TF_row = aic_weights[,colnum] == max(aic_weights[,colnum])
best_models[colnum] = best_model_list[TF_row]
}
print(best_models)

# Get results
#pdf(file = "conus_cors_all_vs_all_v3.pdf", width=15, height=15)
doPDFs2 = TRUE
if (doPDFs2 == TRUE)
{
pdffn = "Figure_SX_model_compare_v8.pdf"
pdf(file=pdffn, width=8, height=10, paper="USr")
}

par(oma=c(1,1,1,1), mar=c(15.6, 4.1, 4.1, 7.1), mfrow = c(1,1))
par(xpd=TRUE)
tmp_colors = c("white", "cyan", "yellow", "green2", "orange", "red", "brown")
tmp_xnames = rep("", length(colnames(aic_weights)))
legend_text = rev(c("Brownian", "O-U", "lambda", "kappa", "delta", "early burst", "white noise"))
tmp_legend = list(x=38, y=0.9, pt.cex=4)

# don't plot y-axis here
par(yaxt="n")
bar_x_positions = barplot(aic_weights, names.arg=tmp_xnames, col=tmp_colors, ylab="Proportion of AIC weight", xlab="shell growth parameter", legend.text=legend_text, args.legend=tmp_legend)

# plot y-axis here
par(yaxt="s")
ticks = c(0, 0.25, 0.5, 0.75, 1)
axis(side=2, at=ticks, line=0, labels=ticks, lty = "solid", lwd = 1)

## Create plot and get bar midpoints in 'mp'
#bar_x_positions = barplot(1:length(colnames(aic_weights)))

## Set up x axis with tick marks alone
#axis(1, at = bar_x_positions, labels = FALSE)

# Create some text labels
tmp_x_labels = colnames(aic_weights)

# Plot x axis labels at mp
text(x=bar_x_positions-0.1, y=-0.015, adj=0, srt=315, labels = tmp_x_labels, xpd=TRUE)

# Plot the best models across the top
text(x=bar_x_positions-0.1, y=1.015, adj=0, srt=45, labels = best_models, xpd=TRUE)

toptxt = "(Best model under AIC)"
text(x=15, y=1.14, labels=toptxt)

caption = "Figure SX: Akaike weights of 7 models for the evolution of shell parameters.  The models \ncompared are: Brownian motion (BM, brown); the Ornstein-Uhlenbeck (OU, red) \nstabilizing selection model; the lambda model (orange) which rescales internal branch \nlengths by a linear fraction; the kappa model (green) which rescales each branch length \nby a power equal to the kappa parameter, and which becomes a speciational model as \nkappa approaches 0; delta (yellow) which focuses change towards the base or tips; \nearly burst (EB, cyan) which has an initial high rate of change that then declines; and white \nnoise (white), where observations are produced by a normal distribution with no tree  \nstructure, which represents the situation of no phylogenetic signal. Brownian motion (BM) \nhas the highest AIC weight for 63% of the shell parameters (15/24).  White noise \n(no phylogenetic signal) is superior for 25% (6/24) shell parameters."

##########################################################

# Turn off the PDF and open
if (doPDFs2 == TRUE)
{
dev.off()
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)
}

##########################################################
# Do LRT on the different models
##########################################################
# Compare all models to BM
pchisq_table = matrix(NA, nrow=5, ncol=ncol(lnl_matrix))

for (colnum in 1:ncol(lnl_matrix))
{
tmp_lnls = lnl_matrix[, colnum]

# Calculate the log-likelihood differences from
# the Brownian motion model
#
# You can't compare white noise, since it's not nested
tmp_lnls_diffs = tmp_lnls[2:(length(tmp_lnls)-1)] - tmp_lnls

# take twice the difference
tmp_lnls_twice_diffs = 2 * tmp_lnls_diffs
dim(tmp_lnls_twice_diffs) = c(1, length(tmp_lnls_twice_diffs))

# Do chi-squared test with 1 degree of freedom
tmp_pchisq = apply(tmp_lnls_twice_diffs, 1, pchisq, 1, lower.tail=FALSE)

pchisq_table[,colnum] = tmp_pchisq

#print(tmp_pchisq)
}

names(pchisq_table) = names(lnl_matrix)
row.names(pchisq_table) = row.names(lnl_matrix)[2:(nrow(lnl_matrix)-1)]

print(pchisq_table, digits=2)

write_table_good(pchisq_table, "pchisq_table.txt")

# 11 / 120 times there is a model significantly better than BM
pchisq_table < 0.05
sum(pchisq_table < 0.05)

# 5/ 24 params reject BM significantly:
# sf2id, tf2ib1, tf2ib2, kr, turn
BM_rejected = apply(pchisq_table < 0.05, 2, sum)

# All false with Bonferroni correction, but that's probably not fair
pchisq_table < (0.05 / (nrow(pchisq_table) * ncol(pchisq_table)) )

##########################################################

#x = dtf_to_phyDat(char_dtf)
txt_suffix = ""

# ACE methods:
# a character specifying the method used for estimation. Three choices are possible: "ML", "pic", or "GLS".

# ACE with ML method
# BM = brownian motion
num_internal_nodes = chtr2\$Nnode

PICcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
PICcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
PICcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )

for (i in 1:ncol(char_dtf))
{
PICcont = ace(char_dtf[,i], chtr2, type="continuous", method="pic", CI=TRUE, model="BM")
PICcont_means[, i] = PICcont\$ace
PICcont_lower025[, i] = PICcont\$CI95[, 1]
PICcont_upper975[, i] = PICcont\$CI95[, 2]

}
names(PICcont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(PICcont_means) = tree_dtf\$node[ (length(chtr2\$tip.label)+1): length(tree_dtf\$node)]
names(PICcont_lower025) = names(char_dtf)
row.names(PICcont_lower025) = row.names(PICcont_means)
names(PICcont_upper975) = names(char_dtf)
row.names(PICcont_upper975) = row.names(PICcont_means)

write_table_good_w_rownames(PICcont_means, "PICcont_means_v8.txt")
write_table_good_w_rownames(PICcont_lower025, "PICcont_lower025_v8.txt")
write_table_good_w_rownames(PICcont_upper975, "PICcont_upper975_v8.txt")

# plot tip & node values to check them

# plot tip & node values to check them
txt1 = "PIC"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
{
recon_txt = paste("ACE technique: ", txt1, txt2, sep="")

tmptipvals = char_dtf[,i]
tmpnodevals = PICcont_means[,i]
maxval = max(c(tmptipvals, tmpnodevals))
texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="")

plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
}

MLcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
MLcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
MLcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
Blombergs_Kstats = c()
PIC.variance.obs = c()
PIC.variance.rnd.mean = c()
PIC.variance.P = c()
PIC.variance.Z = c()

for (i in 1:ncol(char_dtf))
{
MLcont = ace(char_dtf[,i], chtr2, type="continuous", method="ML", CI=TRUE, model="BM")
phylosignals = phylosignal(char_dtf[,i], chtr2)
Blombergs_Kstats = c(Blombergs_Kstats, phylosignals\$K)
PIC.variance.obs = c(PIC.variance.obs, phylosignals\$PIC.variance.obs)
PIC.variance.rnd.mean = c(PIC.variance.rnd.mean, phylosignals\$PIC.variance.rnd.mean)
PIC.variance.P = c(PIC.variance.P, phylosignals\$PIC.variance.P)
PIC.variance.Z = c(PIC.variance.Z, phylosignals\$PIC.variance.Z)

MLcont_means[, i] = MLcont\$ace
MLcont_lower025[, i] = MLcont\$CI95[, 1]
MLcont_upper975[, i] = MLcont\$CI95[, 2]
}
names(MLcont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(MLcont_means) = tree_dtf\$node[ (length(chtr2\$tip.label)+1): length(tree_dtf\$node)]
names(MLcont_lower025) = names(char_dtf)
row.names(MLcont_lower025) = row.names(MLcont_means)
names(MLcont_upper975) = names(char_dtf)
row.names(MLcont_upper975) = row.names(MLcont_means)

# Bonferroni correction
PIC.variance.P_bonferroni = PIC.variance.P * length(Blombergs_Kstats)
phylosignal_table = cbind(Blombergs_Kstats, PIC.variance.obs, PIC.variance.rnd.mean, PIC.variance.Z, PIC.variance.P, PIC.variance.P_bonferroni)
row.names(phylosignal_table) = names(char_dtf)
(phylosignal_table)

write_table_good_w_rownames(MLcont_means, "MLcont_means_v8.txt")
write_table_good_w_rownames(MLcont_lower025, "MLcont_lower025_v8.txt")
write_table_good_w_rownames(MLcont_upper975, "MLcont_upper975_v8.txt")
write_table_good_w_rownames(phylosignal_table, "phylosignal_table_v8.txt")

# plot tip & node values to check them
txt1 = "ML"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
{
recon_txt = paste("ACE technique: ", txt1, txt2, sep="")

tmptipvals = char_dtf[,i]
tmpnodevals = PICcont_means[,i]
maxval = max(c(tmptipvals, tmpnodevals))
texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="")

plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
}

GLScont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
GLScont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
GLScont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
{
GLScont = ace(char_dtf[,i], chtr2, type="continuous", method="GLS", CI=TRUE, corStruct = corBrownian(1, chtr2))
GLScont_means[, i] = GLScont\$ace
GLScont_lower025[, i] = GLScont\$CI95[, 1]
GLScont_upper975[, i] = GLScont\$CI95[, 2]
}
names(GLScont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(GLScont_means) = tree_dtf\$node[ (length(chtr2\$tip.label)+1): length(tree_dtf\$node)]
names(GLScont_lower025) = names(char_dtf)
row.names(GLScont_lower025) = row.names(GLScont_means)
names(GLScont_upper975) = names(char_dtf)
row.names(GLScont_upper975) = row.names(GLScont_means)

write_table_good_w_rownames(GLScont_means, "GLScont_means_v8.txt")
write_table_good_w_rownames(GLScont_lower025, "GLScont_lower025_v8.txt")
write_table_good_w_rownames(GLScont_upper975, "GLScont_upper975_v8.txt")

# write the tree out to a table for Excel
tmptree_table = prt(chtr2)
tmptree_table\$daughter_nds = as.character(tmptree_table\$daughter_nds)
#tmptree_table = tmptree_table[, !(names(tmptree_table) == "daughter_nds")]
write_table_good(tmptree_table, "chtr2_v8.txt")

# plot tip & node values to check them
txt1 = "GLS"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
{
recon_txt = paste("ACE technique: ", txt1, txt2, sep="")

tmptipvals = char_dtf[,i]
tmpnodevals = PICcont_means[,i]
maxval = max(c(tmptipvals, tmpnodevals))
texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="")

plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
}

# Compare the estimates against each other with pairs plots
for (i in 1:ncol(char_dtf))
{
tmpcols = cbind(PICcont_means[,i], MLcont_means[,i], GLScont_means[,i])
names(tmpcols) = c("ACE_pic", "ACE_ML", "ACE_GLS")

pairs.panels(tmpcols)

titletxt = paste("param #", i, ": ", names(char_dtf)[i], sep="")
title(titletxt, line=-1.35, outer=TRUE)
}

#######################################################
# Reconstruct the ancestors -- discrete, plots
# discrete parameter characters
#######################################################
char_dtf = all_params[, colnames_to_use]
num_uniq = apply(char_dtf, 2, function(x) length(unique(x)))

# parsimony reconstruction
#
# https://stat.ethz.ch/pipermail/r-sig-phylo/2010-January/000565.html
# Hi Santiago,
#
# Look at the ace function in the Ape package - to get a max parsimony
# reconstruction set branch lengths =1 and run a maximum likelihood
# estimation.  You can find further information here: http://www.r-phylo.org/wiki/HowTo/Ancestral_State_Reconstruction
#

#x = dtf_to_phyDat(char_dtf)

# make a parsimony tree:
pars_tr = chtr2
# set all branch lengths to 1
pars_tr\$edge.length[!is.na(pars_tr\$edge.length)] = 1
#ace(char_dtf[,1], pars_tr, type="discrete", method="ML", model="ER")

par(mfrow=c(1,2))
ACE_pars_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
{
(ACE_pars = ace(char_dtf[,i], pars_tr, type="discrete", method="ML", model="ER", ip=0.3))

# convert to parsimony
maxpar_node_vals = ACE_pars\$lik.anc
maxpar_node_maxes = apply(maxpar_node_vals, 1, max)

maxpar_node_MP = array(NA, dim=c(num_internal_nodes, 1))
for (j in 1:nrow(maxpar_node_vals))
{
tmpTFs = maxpar_node_vals[j,] == maxpar_node_maxes[j]
names(tmpTFs)
colnum = which(tmpTFs)
if (length(colnum) > 1)
{
tmplist = c()
for (k in 1:length(colnum))
{
tmplist[k] = names(maxpar_node_vals)[colnum[k]]
}
tmpstr = paste(tmplist, collapse="&")
maxpar_node_MP[j] = tmpstr
} else {
maxpar_node_MP[j] = names(maxpar_node_vals)[colnum]
}
}

ACE_pars_estimates[, i] = maxpar_node_MP

# plot results
tree_ht = get_max_height_tree(chtr2)
label_offset = 0.05*tree_ht
plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset)

if (num_uniq[i] == 2)
{
colors = c("yellow", "blue")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors
tmpcols[tmpcols == 1] = colors
}
if (num_uniq[i] == 3)
{
colors = c("yellow", "blue", "red")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors
tmpcols[tmpcols == 1] = colors
tmpcols[tmpcols == 2] = colors
}

tiplabels(pch = 19, col = tmpcols, cex = 2)  #adj = 0,
legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors)

nodelabels(thermo=ACE_pars\$lik.anc, cex=1, piecol=colors)

titletxt = paste("ML, parsimony-like reconstruction of\n", names(char_dtf)[i], " character", sep="")
title(titletxt, cex=0.5)

}

names(ACE_pars_estimates) = names(char_dtf)
pars_tr_dtf = prt(pars_tr)
row.names(ACE_pars_estimates) = tree_dtf\$node[ (length(pars_tr\$tip.label)+1): length(pars_tr_dtf\$node)]
#row.names(ACE_pars_estimates) = row.names(char_dtf)
(ACE_pars_estimates)

write_table_good(ACE_pars_estimates, "ACE_pars_estimates_v8.txt")

# Do ML reconstruction, with varying branch lengths (i.e. not branches = 1 like "parsimony")
par(mfrow=c(1,2))
ACE_ML_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
{
(ACE_ML = ace(char_dtf[,i], chtr2, type="discrete", method="ML", model="ER", ip=0.3))

# convert to the single best ML estimate (plurality vote)
maxML_node_vals = ACE_ML\$lik.anc
maxML_node_maxes = apply(maxML_node_vals, 1, max)

maxML_node_best = array(NA, dim=c(num_internal_nodes, 1))
for (j in 1:nrow(maxML_node_vals))
{
tmpTFs = round(maxML_node_vals[j,], 3) == round(maxML_node_maxes[j], 3)
names(tmpTFs)
colnum = which(tmpTFs)
if (length(colnum) > 1)
{
tmplist = c()
for (k in 1:length(colnum))
{
tmplist[k] = names(maxML_node_vals)[colnum[k]]
}
tmpstr = paste(tmplist, collapse="&")
maxML_node_best[j] = tmpstr
} else {
maxML_node_best[j] = names(maxML_node_vals)[colnum]
}
}

#cbind(maxML_node_vals, maxML_node_maxes, maxML_node_best)

ACE_ML_estimates[, i] = maxML_node_best

# plot results
tree_ht = get_max_height_tree(chtr2)
label_offset = 0.05*tree_ht
plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset)

if (num_uniq[i] == 2)
{
colors = c("yellow", "blue")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors
tmpcols[tmpcols == 1] = colors
}
if (num_uniq[i] == 3)
{
colors = c("yellow", "blue", "red")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors
tmpcols[tmpcols == 1] = colors
tmpcols[tmpcols == 2] = colors
}
tiplabels(pch = 19, col = tmpcols, cex = 2)  #adj = 0,
legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors)

nodelabels(thermo=ACE_ML\$lik.anc, cex=1, piecol=colors)

titletxt = paste("ML, equal rates reconstruction of\n", names(char_dtf)[i], " character", sep="")
title(titletxt, cex=0.5)
}

names(ACE_ML_estimates) = names(char_dtf)
pars_tr_dtf = prt(pars_tr)
row.names(ACE_ML_estimates) = tree_dtf\$node[ (length(pars_tr\$tip.label)+1): length(pars_tr_dtf\$node)]
#row.names(ACE_ML_estimates) = row.names(char_dtf)
(ACE_ML_estimates)

write_table_good(ACE_ML_estimates, "ACE_ML_estimates_v8.txt")

# Write out the trees we generates, also...
write.tree(tr, "tr.newick")
write.tree(tr2, "tr2.newick")
write.tree(chtr2, "chtr2.newick")```
```