Tools For Evo Devo
##############################################################
# =========================================================
# An Evo-Devo Analysis in R: an example with cone snails
# =========================================================
# by Nick Matzke (and whoever else adds to this PhyloWiki page)
# Copyright 2011-infinity
# matzkeATberkeley.edu
# March 2011
#
# Please link/cite if you use this, email me if you have
# thoughts/improvements/corrections.
#
##############################################################
#
# Copyright: this lab contains/uses unpublished research data &
# analyses.
#
# IB200b students are of course free to re-use the code, others should
# contact me about re-use of the code or data.
#
# This work was done in collaboration with Monty Slatkin, George
# Oster, and Neil Gong.
#
###################################################################
#
# Assignment for Tuesday, March 8
#
# In this lab, I will take you through a real analysis I am
# working on that is basically applying ancestral character
# reconstruction methods and model selection to an evo-devo
# question.
#
# George Oster's lab (Oster is a biophysicist) developed a shell-
# growth model for cone snails that runs in Matlab. (Google cone
# snails to look at some shells.)
#
# Basically, given a series of 24 input parameters, the model grows
# a shell, complete with the pattern, which might be stripes, dots,
# triangles, blank, etc. The first 19 parameters control various
# signal and response curves that cells of the cone snail mantle,
# under the model issue and receive, which then control the release
# of pigment.
#
# The last 5 parameters deal with shell shape:
#
# kz, kr, and turn are from Raup's famous shell-shape model, and
# control the # of turns, the angle, etc. of the shell
#
# N and T are starting conditions for the model (# of cells at the
# growing edge, and the time point they start at).
#
#
#
# Once they built the model, Oster's lab then took a bunch of living
# species, and figured out the parameters that grow patterns to match
# each of these species.
#
# Oster then came over to Integrative Biology to "do some evolution
# with this". Some obvious questions are:
#
# 1. Is there any phylogenetic signal in these parameters, which
# after all are not direct measurements, but parameters that were
# fit to a computer model by eye.
#
# 1a. Are the parameters statistically independent, or highly
# correlated with each other?
#
# 2. Is Brownian motion a reasonable model under which to
# conduct ancestral state reconstruction?
#
# 3. What model of evolution best describes each parameter?
# Available models are:
# - Brownian motion (BM);
#
# - the Ornstein−Uhlenbeck (OU) stabilizing selection model;
#
# - the lambda model which rescales internal branch lengths by
# a linear fraction;
#
# - the kappa model (green) which rescales each branch length by a power
# equal to the kappa parameter, and which becomes a speciational model
# as kappa approaches 0;
#
# - delta (yellow) which focuses change towards the base or tips;
#
# - early burst (EB, cyan) which has an initial high rate of change that
# then declines;
#
# - and white noise (white), where observations are produced by a normal
# distribution with no tree structure, which represents the situation
# of no phylogenetic signal.
#
# See references under the function fitContinuous.
#
# Assignment:
#
# Work through this code, and try to understand what each
# stage does.
#
# Have a look at the final Likelihood Ratio Test results, and the model
# comparison figure which uses AIC, and email me your verbal summary of
# those results. E.g., which model is best, how confidence should the
# reader be in those conclusions, and what are the implications in
# terms of understanding the evolutionary processes involved,
# and the likely accuracy of using Brownian motion for ancestral
# character reconstruction.
#
# Files to download:
#
# Start by looking at the graphical file Neil_tree.png on the website.
# This has photos of the cone snail species, and the matching shells
# grown in the computer, on a DNA phylogeny.
#
# The parameters are stored in parameters_njm7.txt, which this script
# takes as input, along with the DNA tree, tr.newick, and the
# ultrametricized version, tr2.newick.
#
# Just to cut to the end, The final results figure is at:
# tree_DNA_num_ML_inv2.png
#
###################################################################
# Stuff that is handy at the beginning of a script
######################################################
# R, close all devices/windows/ quartz windows
graphics.off()
# Turn off all BS stringsAsFactors silliness
# R tends to read
options(stringsAsFactors = FALSE)
######################################################
# Working directories:
# One of the first things you want to do, usually, is
# decide on your working directory
# On my Mac, this is the directory
wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab10"
# On a PC, you might have to specify paths like this:
#wd = "c:\\Users\\nick\\Desktop\\_ib200a\\ib200b_sp2011\\lab10"
#desc: set working directory
setwd(wd)
#desc: get working directory
getwd()
#desc: list the files in the directory
list.files()
# Sourcing scripts
# Local sourcing:
# source a script (windows machines require \\ instead of /)
# sourcedir = '/_njm/'
#source3 = '_genericR_v1.R'
#source(paste(sourcedir, source3, sep=""))
# You need the APE package to source genericR_v1.R scripts
install.packages("ape")
library(ape)
# Remote sourcing:
source("http://ib.berkeley.edu/courses/ib200b/scripts/_genericR_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_conus_analyses_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_R_tree_functions_v1.R")
# Setup:
# install these, if needed
library(ape)
library(phangorn) # helps with "midpoint" function
library(picante) # for "Kcalc" and "phylosignal", i.e. calculation of the kappa statistic for continuous parameters
library(geiger) # for e.g. fitContinuous
#########################################
# Plot the original DNA tree
#########################################
trfn = "tr.newick"
tr = read.tree(trfn)
plot(tr)
title("Original DNA tree (digitized from Nam et al. 2009\nusing TreeRogue 0.1)\n(Note: axis scale is in digitization coordinates)")
axisPhylo()
add.scale.bar(x=0, y=23, length=1)
# Get the parameters file
params_orig = read_table_good("parameters_njm7.txt")
# identify the types of params
uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia", "i1s1g", "i1s2eg", "i1s2ig", "i1s3g", "i1sf2ia", "i2s1g", "i2s2eg", "i2s2ig", "i2s3g", "i2sf2ia")
continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2")
shell_params_names = c("kz", "kr", "turn")
starting_params_names = c("N", "T")
complex_params_names = c("s3n_std", "s3t_spd")
clad_chars_names = c("stripes", "triangles", "dots", "color", "conical", "food")
complex1_names = names(params_orig)[(ncol(params_orig)-25*2) : (ncol(params_orig)-26)]
complex2_names = names(params_orig)[(ncol(params_orig)-25) : ncol(params_orig)]
nonsucky_names = c(continuous_params_names, shell_params_names, starting_params_names, complex_params_names, clad_chars_names)
all_cont_names = c(continuous_params_names, shell_params_names, starting_params_names)
# remove the uniform params
cols_tokeep = names(params_orig) %in% uniform1_params_names == FALSE
all_params = params_orig[, cols_tokeep]
row.names(all_params) = all_params$species
(all_params)
###################################
# Drop the tips we're not using
###################################
all_params_orig = all_params
#n1 = xy3$nodenames[xy3$nodenames != ""]
n1 = tr$tip.label
n2 = all_params$species
n1_to_keep = n1 %in% n2
print(cbind(n1, n1_to_keep))
# drop the tips we're not using
tr2 = drop.tip(tr, n1[n1_to_keep == FALSE])
####################################################################
# reorder the parameters to match the order of tips in the tree
# (this is CRUCIAL!!)
####################################################################
# tree tip order, from bottom to top:
(labels1 = tr2$tip.label)
# order of rows in char_dtf
(labels2 = row.names(all_params))
# reorder char_dtf to match tip.labels
new_order = match(labels1, labels2)
all_params = all_params_orig[new_order, ]
params = all_params
# Cut params down to just continuous parameters for correlation analysis
params = params[, all_cont_names]
####################################
#pairs(params[, -1])
cps = cor(params)
cps = as.data.frame(cps)
names_cps = names(cps)
#dim(params[, -1])
# Rank pairwise correlations
pairwise_cors_list = c()
for (i in 2:nrow(cps))
{
if ((i+1) < ncol(cps))
{
for (j in (i+1):ncol(cps))
{
tmprow = c(names_cps[i], names_cps[j], cps[i,j], abs(cps[i, j]))
pairwise_cors_list = rbind(pairwise_cors_list, tmprow)
}
}
}
row.names(pairwise_cors_list) = NULL
pairwise_cors_list = as.data.frame(pairwise_cors_list)
names(pairwise_cors_list) = c("x", "y", "cor", "abs_cor")
#rank(pairwise_cors_list$abs_cor)
order_cors = order(pairwise_cors_list$abs_cor, decreasing=TRUE)
pairwise_cors_list = pairwise_cors_list[order_cors, ]
print("Top correlations:")
(pairwise_cors_list[1:20, ])
######################################################
# Phylogenetically Independent Contrasts
######################################################
# we are going to use an ultrametric tree:
chtr2 = read.tree("chtr2.newick")
#chronogram(tr2, scale = 1, expo = 2, minEdgeLength = 1e-06)
write.tree(chtr2, file="chtr2.newick")
plot(chtr2)
title("DNA tree, ultrametricized with APE's chronogram() function: NPRS\n(nonparametric rate smoothing), basically a least squares fit after Sanderson 1997")
axisPhylo()
add.scale.bar(x=0, y=0.9, length=0.1)
pic_params = c()
for (i in 1:ncol(params))
{
x = pic(params[,i], chtr2)
pic_params = cbind(pic_params, x)
}
pic_params = as.data.frame(pic_params)
names(pic_params) = names(params)
#pairs(pic_params)
cor(pic_params)
# Plot all params vs. all params
tmpparams = params[, all_cont_names]
tmpparams = dfnums_to_numeric(tmpparams)
tmpparams[tmpparams == "?"] = NA
pairs.panels(tmpparams, smooth=FALSE)
title("Regular correlations")
pairs.panels(pic_params, smooth=FALSE)
title("PIC correlations")
# Assemble the tables of correlations for params and PICs of params
res1_txt_table = c()
respic_txt_table = c()
for (i in 1:ncol(params))
{
for (j in i:ncol(params))
{
if (i == j)
{
next()
}
cat("x=", names(params)[i], " vs. y=", names(params)[j], "\n", sep="")
res1 = cor.test(params[, i], params[, j])
respic = cor.test(pic_params[, i], pic_params[, j])
#print()
#print()
lm_res1 = lm(params[, j] ~ params[, i])
lm_respic = lm(pic_params[, j] ~ pic_params[, i])
res1_txt = c(names(params)[i], names(params)[j], res1$statistic, res1$parameter, res1$p.value, res1$estimate, res1$null.value, res1$alternative, res1$method, res1$data.name, res1$conf.int[1], res1$conf.int[2], lm_res1$coefficients[2], lm_res1$coefficients[1])
respic_txt = c(names(params)[i], names(params)[j], respic$statistic, respic$parameter, respic$p.value, respic$estimate, respic$null.value, respic$alternative, respic$method, respic$data.name, respic$conf.int[1], respic$conf.int[2], lm_respic$coefficients[2], lm_respic$coefficients[1])
res1_txt_table = rbind(res1_txt_table, res1_txt)
respic_txt_table = rbind(respic_txt_table, respic_txt)
}
}
res1_txt_table = as.data.frame(res1_txt_table, row.names=1:nrow(res1_txt_table))
names(res1_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept"))
respic_txt_table = as.data.frame(respic_txt_table, row.names=1:nrow(respic_txt_table))
names(respic_txt_table) = (c("x", "y", "statistic_t", "parameter_df", "pval", "cor_estimate", "null", "alternative", "method", "data.name", "conf.int1", "conf.int2", "slope", "intercept"))
res1_txt_table = dfnums_to_numeric(res1_txt_table)
respic_txt_table = dfnums_to_numeric(respic_txt_table)
res1_txt_table$pval_bonf = res1_txt_table$pval * nrow(res1_txt_table)
respic_txt_table$pval_bonf = respic_txt_table$pval * nrow(respic_txt_table)
res1_txt_table = res1_txt_table[order(respic_txt_table$pval_bonf),]
respic_txt_table = respic_txt_table[order(respic_txt_table$pval_bonf),]
(res1_txt_table[1:20,])
(respic_txt_table[1:20,])
# sort by ascending p-value
res1_txt_table_sort = res1_txt_table[order(res1_txt_table$pval), ]
respic_txt_table_sort = respic_txt_table[order(respic_txt_table$pval), ]
(res1_txt_table_sort[1:20,])
(respic_txt_table_sort[1:20,])
x = res1_txt_table$cor_estimate
y = respic_txt_table$cor_estimate
xlab = "r (regular)"
ylab="r (PIC)"
xlim = c(-1, 1)
ylim = c(-1, 1)
titlestart_txt = "correlation (r) estimates for pairwise correlations,\nnormal vs. phylogenetically independent contrasts;\n"
basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim)
x = res1_txt_table$pval
y = respic_txt_table$pval
xlab="p-val (regular)"
ylab="p-val (PIC)"
xlim = c(0, 1)
ylim = c(0, 1)
titlestart_txt = "p-values for pairwise correlations,\nnormal vs. phylogenetically independent contrasts\n"
basic_xy_plot(x, y, titlestart_txt, xlab, ylab, xlim, ylim)
# Write the tables to text
write.table(res1_txt_table, "res1_txt_table.txt")
write.table(respic_txt_table, "respic_txt_table.txt")
############################################################
#####################################################
# Assess the parsimony stats of the discrete characters
#####################################################
colnames_to_use = c(complex_params_names, clad_chars_names)
char_dtf_orig = all_params[, colnames_to_use]
char_dtf = char_dtf_orig
# Make the final figure comparing the trees
# quite fast
#(parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf)))
(parstats = parsim_stats_fast(tr2, dfnums_to_numeric(char_dtf)))
write_table_good(parstats, "parstats_v8.txt")
# slower
#(parstats = parsim_stats(tr2, char_dtf))
# Now, resample (takes a few seconds)
num_resamps = 100
parstats_null = array(data=NA, dim=c(ncol(char_dtf)+1, 6, num_resamps+1))
for (i in 1:num_resamps)
{
tmpsamp = resample_dtf_matrix(char_dtf)
parstats_null[, , i] = as.matrix(parsim_stats_fast(tr2, tmpsamp))
}
parstats_null[, , num_resamps+1] = as.matrix(parstats)
summary_parstats_means = parstats
summary_parstats_sds = parstats
summary_parstats_pvals = parstats
# do histograms of the six different distances
# set outer margins & number of subplots
par(oma=c(1,1,5,1), mfrow=c(2,3))
for (i in 1:nrow(parstats))
{
for (j in 1:ncol(parstats))
{
x = parstats_null[i, j, ]
summary_parstats_means[i, j] = mean(x)
summary_parstats_sds[i, j] = sd(x)
observed_val = parstats_null[i, j, num_resamps+1]
pval = round(empirical_pval_fast(x, observed_val), 4)
# redo pval depending on whether value should be low or high
if (j > 3)
{
pval = round(1-pval, 4)
}
summary_parstats_pvals[i, j] = pval
tmp_title = paste("non-parametric p-value = ", pval, sep="")
h = hist(x, main=tmp_title, breaks=50, cex.main=1, xlab=names(parstats)[j])
# add the arrow
arrowtop = 0.3*max(h$counts, na.rm=TRUE)
arrowbot = 0.03*max(h$counts, na.rm=TRUE)
arrows(observed_val, arrowtop, observed_val, arrowbot, col="blue", lwd=4)
}
titletxt = paste("Parsimony stats for ", row.names(parstats)[i], sep="")
mtext(titletxt, outer=TRUE)
}
write_table_good(summary_parstats_means, "parstats_means_v8.txt")
write_table_good(summary_parstats_sds, "parstats_sds_v8.txt")
write_table_good(summary_parstats_pvals, "parstats_pvals_v8.txt")
#######################################################
# Ancestral character reconstruction -- continuous, plots
#######################################################
par(mfrow=c(1,2))
# character mapping
# uniform1_params_names = c("s1g", "s2eg", "s2ig", "s3g", "sf2ia")
# continuous_params_names = c("s1n", "s1t", "s2en", "s2et", "s2in", "s2it", "s3n", "s3t", "sf2ea", "sf2ed", "sf2id", "tf2eb1", "tf2ec1", "tf2eb2", "tf2ec2", "tf2ib1", "tf2ic1", "tf2ib2", "tf2ic2")
# shell_params_names = c("kz", "kr", "turn")
# starting_params_names = c("N", "T")
# complex_params_names = c("s3n_std", "s3t_spd")
# clad_chars_names
# continuous parameter characters
colnames_to_use = c(continuous_params_names, shell_params_names, starting_params_names)
char_dtf_orig = all_params[, colnames_to_use]
char_dtf = char_dtf_orig
#################
# reorder char_dtf
################
# tree tip order, from bottom to top:
#(labels1 = chtr2$tip.label)
# order of rows in char_dtf
#(labels2 = row.names(char_dtf))
# reorder char_dtf to match tip.labels
#new_order = match(labels1, labels2)
#char_dtf = char_dtf[new_order, ]
par(mfrow=c(6,4))
for (colnum in 1:ncol(char_dtf))
{
hist(as.numeric(char_dtf[,colnum]), 17, main=title(names(char_dtf)[colnum]) )
}
par(mfrow=c(1,1))
###########################################################################
# Compare different models for the evolution of this character
###########################################################################
runthis4 = TRUE
if (runthis4 == TRUE)
{
# models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white", "trend")
# cut "trend" model, doesn't matter for ultrametric trees (same as BM model, evidently)
models_list = c("BM", "OU", "lambda", "kappa", "delta", "EB", "white")
model_fits = NULL
# GET THE VARIOUS PARAMETERS ALSO
# beta (typically the rate)
# alpha (central tendency on BM walk that is proportional to alpha)
# Lambda model
# multiplies all internal branches of the tree by lambda, leaving tip
# branches as their original length
#
# kappa raises all branch lengths to the power kappa. As kappa approaches
# zero, the model becomes speciational.
#
# (apparently, kappa is saved as "lamda" in the kappa model)
# delta (delta>1, evo tipwards, < 1, evo at base)
# EB (a = proportion of initial rate represented by the end rate (?))
# White Noise (draw from mean/variance)
# mean = mean
# nv = variance (?)
# THESE BOUNDS PRODUCE PATHOLOGICAL RESULTS (on my first dataset)
bounds_list = NULL
# bounds_list$BM = list(beta=c(1,100))
# bounds_list$OU = list(alpha=c(0.0001, 10), beta=c(1, 100))
# bounds_list$lambda = list(beta=c(1,100))
# bounds_list$kappa = list(beta=c(1,100))
# bounds_list$EB = list(beta=c(1,100))
# bounds_list$white = NULL
# THESE BOUNDS ARE THE DEFAULTS
bounds_list = NULL
# db = default bounds
#db_beta = c(1e-08, 20)
#db_lambda = c(1e-07, 1)
#db_kappa = c(1e-06, 1)
#db_delta = c(1e-05, 2.999999)
#db_alpha = c(1e-07, 50)
#db_a = c(-3, 0)
#db_nv = c(1e-10, 100)
#db_mu = c(-100, 100)
# YOU SHOULD ALWAYS CHECK IF YOUR ESTIMATES HIT THE BOUNDS, AND
# THEN RE-ADJUST
bounds_list = NULL
# db = default bounds
db_beta = c(1e-08, 1e08)
db_alpha = c(1e-07, 50)
db_lambda = c(1e-07, 1)
db_kappa = c(1e-06, 1)
db_delta = c(1e-05, 2.9999)
db_a = c(-3, 0)
db_nv = c(1e-10, 1000000)
#db_mu = c(-100, 100)
bounds_list$BM = list(beta=db_beta)
bounds_list$OU = list(beta=db_beta, alpha=db_alpha)
bounds_list$lambda = list(beta=db_beta, lambda=db_lambda)
bounds_list$kappa = list(beta=db_beta, lambda=db_kappa)
bounds_list$EB = list(beta=db_beta, a=db_a)
bounds_list$white = list(nv=db_nv)
# bounds_list$model = list(mu=db_mu)
for (i in 1:length(models_list))
{
# Chose the right bounds
cmdstr = paste("tmp_bounds = bounds_list$", models_list[i], sep="")
eval(parse(text = cmdstr))
model_fit = fitContinuous(chtr2, char_dtf, model=models_list[i], bounds = tmp_bounds)
cmdstr = paste("model_fits$", models_list[i], " = model_fit", sep="")
eval(parse(text = cmdstr))
}
# Save R object to a file
save(model_fits, file="model_fits.Rdata")
} else {
load(file="model_fits.Rdata")
}
model_fits_orig = model_fits
model_fits = model_fits_orig
# Get log likelihoods and AICs
results_list = c("lnl", "aic", "aicc", "k", "beta", "alpha", "lambda", "delta", "a", "mean", "nv")
# fill in blank results
for (item in results_list)
{
cmdstr = paste(item, "_matrix = matrix(NA, nrow=length(models_list), ncol=length(all_cont_names))", sep="")
eval(parse(text = cmdstr))
}
# put in the data from the model results
for (i in 1:length(models_list))
{
for (j in 1:length(all_cont_names))
{
for (item in results_list)
{
cmdstr = paste("tmp_item = model_fits$", models_list[i], "$", all_cont_names[j], "$", item, sep="")
eval(parse(text = cmdstr))
if (is.null(tmp_item))
{
cmdstr = paste(item, "_matrix[i,j] = NA", sep="")
eval(parse(text = cmdstr))
} else {
cmdstr = paste(item, "_matrix[i,j] = tmp_item", sep="")
eval(parse(text = cmdstr))
}
}
}
}
# Apply the row & column names
for (item in results_list)
{
cmdstr = paste(item, "_matrix = adf(", item, "_matrix)", sep="")
eval(parse(text = cmdstr))
cmdstr = paste("names(", item, "_matrix) = names(char_dtf)", sep="")
eval(parse(text = cmdstr))
cmdstr = paste("row.names(", item, "_matrix) = models_list", sep="")
eval(parse(text = cmdstr))
# write to txt file
cmdstr = paste("write_table_good(", item, "_matrix, '", item, "_matrix.txt')", sep="")
eval(parse(text = cmdstr))
}
lnl_matrix
aic_matrix
aicc_matrix
k_matrix
beta_matrix
alpha_matrix
lambda_matrix
delta_matrix
a_matrix
mean_matrix
nv_matrix
# Compare the models using AIC
# Another way: use AIC
# For each col, subtract the min
diff_aic_matrix = aic_matrix
for (colnum in 1:ncol(aic_matrix))
{
diff_aic_matrix[, colnum] = aic_matrix[, colnum] - min(aic_matrix[, colnum], na.rm=TRUE)
}
cat("Table of difference-from-best-AIC values; 0 = best model\n")
print(diff_aic_matrix, digits=2)
# Citation for this calculation:
# http://www.brianomeara.info/tutorials/aic
#
# or:
#
#
aic_wi = exp(1) ^ (-0.5 * diff_aic_matrix)
colsum_aic_wi = apply(aic_wi, 2, sum, na.rm=TRUE)
# calculate the relative weights
aic_weights = aic_wi
for (colnum in 1:ncol(aic_wi))
{
aic_weights[,colnum] = aic_wi[,colnum] / colsum_aic_wi[colnum]
}
#
cat("These are the AIC weights, calculated according to Burnham & Anderson (2002).\n")
print(aic_weights, digits=2)
# flip aic_weights for graphing
aic_weights[nrow(aic_weights):1, ] = aic_weights
row.names(aic_weights) = row.names(aic_weights)[nrow(aic_weights):1]
#####################################
# Plot AIC weights, with labels
#####################################
# What are the best models under AIC
best_model_list = rownames(aic_weights)
best_models = rep(NA, length(colnames(aic_weights)))
for (colnum in 1:length(colnames(aic_weights)))
{
TF_row = aic_weights[,colnum] == max(aic_weights[,colnum])
best_models[colnum] = best_model_list[TF_row]
}
print(best_models)
# Get results
#pdf(file = "conus_cors_all_vs_all_v3.pdf", width=15, height=15)
doPDFs2 = TRUE
if (doPDFs2 == TRUE)
{
pdffn = "Figure_SX_model_compare_v8.pdf"
pdf(file=pdffn, width=8, height=10, paper="USr")
}
par(oma=c(1,1,1,1), mar=c(15.6, 4.1, 4.1, 7.1), mfrow = c(1,1))
par(xpd=TRUE)
tmp_colors = c("white", "cyan", "yellow", "green2", "orange", "red", "brown")
tmp_xnames = rep("", length(colnames(aic_weights)))
legend_text = rev(c("Brownian", "O-U", "lambda", "kappa", "delta", "early burst", "white noise"))
tmp_legend = list(x=38, y=0.9, pt.cex=4)
# don't plot y-axis here
par(yaxt="n")
bar_x_positions = barplot(aic_weights, names.arg=tmp_xnames, col=tmp_colors, ylab="Proportion of AIC weight", xlab="shell growth parameter", legend.text=legend_text, args.legend=tmp_legend)
# plot y-axis here
par(yaxt="s")
ticks = c(0, 0.25, 0.5, 0.75, 1)
axis(side=2, at=ticks, line=0, labels=ticks, lty = "solid", lwd = 1)
## Create plot and get bar midpoints in 'mp'
#bar_x_positions = barplot(1:length(colnames(aic_weights)))
## Set up x axis with tick marks alone
#axis(1, at = bar_x_positions, labels = FALSE)
# Create some text labels
tmp_x_labels = colnames(aic_weights)
# Plot x axis labels at mp
text(x=bar_x_positions-0.1, y=-0.015, adj=0, srt=315, labels = tmp_x_labels, xpd=TRUE)
# Plot the best models across the top
text(x=bar_x_positions-0.1, y=1.015, adj=0, srt=45, labels = best_models, xpd=TRUE)
toptxt = "(Best model under AIC)"
text(x=15, y=1.14, labels=toptxt)
caption = "Figure SX: Akaike weights of 7 models for the evolution of shell parameters. The models \ncompared are: Brownian motion (BM, brown); the Ornstein-Uhlenbeck (OU, red) \nstabilizing selection model; the lambda model (orange) which rescales internal branch \nlengths by a linear fraction; the kappa model (green) which rescales each branch length \nby a power equal to the kappa parameter, and which becomes a speciational model as \nkappa approaches 0; delta (yellow) which focuses change towards the base or tips; \nearly burst (EB, cyan) which has an initial high rate of change that then declines; and white \nnoise (white), where observations are produced by a normal distribution with no tree \nstructure, which represents the situation of no phylogenetic signal. Brownian motion (BM) \nhas the highest AIC weight for 63% of the shell parameters (15/24). White noise \n(no phylogenetic signal) is superior for 25% (6/24) shell parameters."
text(x=-5.4, y=-0.56, labels=caption, adj=0, xpd=TRUE)
##########################################################
# Turn off the PDF and open
if (doPDFs2 == TRUE)
{
dev.off()
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)
}
##########################################################
# Do LRT on the different models
##########################################################
# Compare all models to BM
pchisq_table = matrix(NA, nrow=5, ncol=ncol(lnl_matrix))
for (colnum in 1:ncol(lnl_matrix))
{
tmp_lnls = lnl_matrix[, colnum]
# Calculate the log-likelihood differences from
# the Brownian motion model
#
# You can't compare white noise, since it's not nested
tmp_lnls_diffs = tmp_lnls[2:(length(tmp_lnls)-1)] - tmp_lnls[1]
# take twice the difference
tmp_lnls_twice_diffs = 2 * tmp_lnls_diffs
dim(tmp_lnls_twice_diffs) = c(1, length(tmp_lnls_twice_diffs))
# Do chi-squared test with 1 degree of freedom
tmp_pchisq = apply(tmp_lnls_twice_diffs, 1, pchisq, 1, lower.tail=FALSE)
pchisq_table[,colnum] = tmp_pchisq
#print(tmp_pchisq)
}
pchisq_table = adf(pchisq_table)
names(pchisq_table) = names(lnl_matrix)
row.names(pchisq_table) = row.names(lnl_matrix)[2:(nrow(lnl_matrix)-1)]
print(pchisq_table, digits=2)
write_table_good(pchisq_table, "pchisq_table.txt")
# 11 / 120 times there is a model significantly better than BM
pchisq_table < 0.05
sum(pchisq_table < 0.05)
# 5/ 24 params reject BM significantly:
# sf2id, tf2ib1, tf2ib2, kr, turn
BM_rejected = apply(pchisq_table < 0.05, 2, sum)
# All false with Bonferroni correction, but that's probably not fair
pchisq_table < (0.05 / (nrow(pchisq_table) * ncol(pchisq_table)) )
##########################################################
#x = dtf_to_phyDat(char_dtf)
txt_suffix = ""
# ACE methods:
# a character specifying the method used for estimation. Three choices are possible: "ML", "pic", or "GLS".
# ACE with ML method
# BM = brownian motion
num_internal_nodes = chtr2$Nnode
PICcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
PICcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
PICcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
{
PICcont = ace(char_dtf[,i], chtr2, type="continuous", method="pic", CI=TRUE, model="BM")
PICcont_means[, i] = PICcont$ace
PICcont_lower025[, i] = PICcont$CI95[, 1]
PICcont_upper975[, i] = PICcont$CI95[, 2]
}
PICcont_means = adf(PICcont_means)
names(PICcont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(PICcont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)]
PICcont_lower025 = adf(PICcont_lower025)
names(PICcont_lower025) = names(char_dtf)
row.names(PICcont_lower025) = row.names(PICcont_means)
PICcont_upper975 = adf(PICcont_upper975)
names(PICcont_upper975) = names(char_dtf)
row.names(PICcont_upper975) = row.names(PICcont_means)
write_table_good_w_rownames(PICcont_means, "PICcont_means_v8.txt")
write_table_good_w_rownames(PICcont_lower025, "PICcont_lower025_v8.txt")
write_table_good_w_rownames(PICcont_upper975, "PICcont_upper975_v8.txt")
# plot tip & node values to check them
# plot tip & node values to check them
txt1 = "PIC"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
{
recon_txt = paste("ACE technique: ", txt1, txt2, sep="")
tmptipvals = char_dtf[,i]
tmpnodevals = PICcont_means[,i]
maxval = max(c(tmptipvals, tmpnodevals))
texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="")
plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
}
MLcont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
MLcont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
MLcont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
Blombergs_Kstats = c()
PIC.variance.obs = c()
PIC.variance.rnd.mean = c()
PIC.variance.P = c()
PIC.variance.Z = c()
for (i in 1:ncol(char_dtf))
{
MLcont = ace(char_dtf[,i], chtr2, type="continuous", method="ML", CI=TRUE, model="BM")
phylosignals = phylosignal(char_dtf[,i], chtr2)
Blombergs_Kstats = c(Blombergs_Kstats, phylosignals$K)
PIC.variance.obs = c(PIC.variance.obs, phylosignals$PIC.variance.obs)
PIC.variance.rnd.mean = c(PIC.variance.rnd.mean, phylosignals$PIC.variance.rnd.mean)
PIC.variance.P = c(PIC.variance.P, phylosignals$PIC.variance.P)
PIC.variance.Z = c(PIC.variance.Z, phylosignals$PIC.variance.Z)
MLcont_means[, i] = MLcont$ace
MLcont_lower025[, i] = MLcont$CI95[, 1]
MLcont_upper975[, i] = MLcont$CI95[, 2]
}
MLcont_means = adf(MLcont_means)
names(MLcont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(MLcont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)]
MLcont_lower025 = adf(MLcont_lower025)
names(MLcont_lower025) = names(char_dtf)
row.names(MLcont_lower025) = row.names(MLcont_means)
MLcont_upper975 = adf(MLcont_upper975)
names(MLcont_upper975) = names(char_dtf)
row.names(MLcont_upper975) = row.names(MLcont_means)
# Bonferroni correction
PIC.variance.P_bonferroni = PIC.variance.P * length(Blombergs_Kstats)
phylosignal_table = cbind(Blombergs_Kstats, PIC.variance.obs, PIC.variance.rnd.mean, PIC.variance.Z, PIC.variance.P, PIC.variance.P_bonferroni)
row.names(phylosignal_table) = names(char_dtf)
(phylosignal_table)
write_table_good_w_rownames(MLcont_means, "MLcont_means_v8.txt")
write_table_good_w_rownames(MLcont_lower025, "MLcont_lower025_v8.txt")
write_table_good_w_rownames(MLcont_upper975, "MLcont_upper975_v8.txt")
write_table_good_w_rownames(phylosignal_table, "phylosignal_table_v8.txt")
# plot tip & node values to check them
txt1 = "ML"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
{
recon_txt = paste("ACE technique: ", txt1, txt2, sep="")
tmptipvals = char_dtf[,i]
tmpnodevals = PICcont_means[,i]
maxval = max(c(tmptipvals, tmpnodevals))
texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="")
plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
}
GLScont_means = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
GLScont_lower025 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
GLScont_upper975 = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
{
GLScont = ace(char_dtf[,i], chtr2, type="continuous", method="GLS", CI=TRUE, corStruct = corBrownian(1, chtr2))
GLScont_means[, i] = GLScont$ace
GLScont_lower025[, i] = GLScont$CI95[, 1]
GLScont_upper975[, i] = GLScont$CI95[, 2]
}
GLScont_means = adf(GLScont_means)
names(GLScont_means) = names(char_dtf)
tree_dtf = prt(chtr2)
row.names(GLScont_means) = tree_dtf$node[ (length(chtr2$tip.label)+1): length(tree_dtf$node)]
GLScont_lower025 = adf(GLScont_lower025)
names(GLScont_lower025) = names(char_dtf)
row.names(GLScont_lower025) = row.names(GLScont_means)
GLScont_upper975 = adf(GLScont_upper975)
names(GLScont_upper975) = names(char_dtf)
row.names(GLScont_upper975) = row.names(GLScont_means)
write_table_good_w_rownames(GLScont_means, "GLScont_means_v8.txt")
write_table_good_w_rownames(GLScont_lower025, "GLScont_lower025_v8.txt")
write_table_good_w_rownames(GLScont_upper975, "GLScont_upper975_v8.txt")
# write the tree out to a table for Excel
tmptree_table = prt(chtr2)
tmptree_table$daughter_nds = as.character(tmptree_table$daughter_nds)
#tmptree_table = tmptree_table[, !(names(tmptree_table) == "daughter_nds")]
write_table_good(tmptree_table, "chtr2_v8.txt")
# plot tip & node values to check them
txt1 = "GLS"
txt2 = txt_suffix
for (i in 1:ncol(char_dtf))
{
recon_txt = paste("ACE technique: ", txt1, txt2, sep="")
tmptipvals = char_dtf[,i]
tmpnodevals = PICcont_means[,i]
maxval = max(c(tmptipvals, tmpnodevals))
texttxt = paste(recon_txt, "\nParameter: ", names(char_dtf)[i], "\nMaximum value = ", maxval, sep="")
plot_cont_vals_on_tree(chtr2, tmptipvals, tmpnodevals, texttxt)
}
# Compare the estimates against each other with pairs plots
for (i in 1:ncol(char_dtf))
{
tmpcols = cbind(PICcont_means[,i], MLcont_means[,i], GLScont_means[,i])
tmpcols = adf(tmpcols)
names(tmpcols) = c("ACE_pic", "ACE_ML", "ACE_GLS")
pairs.panels(tmpcols)
titletxt = paste("param #", i, ": ", names(char_dtf)[i], sep="")
title(titletxt, line=-1.35, outer=TRUE)
}
#######################################################
# Reconstruct the ancestors -- discrete, plots
# discrete parameter characters
#######################################################
colnames_to_use = c(complex_params_names, clad_chars_names)
char_dtf = all_params[, colnames_to_use]
num_uniq = apply(char_dtf, 2, function(x) length(unique(x)))
# parsimony reconstruction
#
# https://stat.ethz.ch/pipermail/r-sig-phylo/2010-January/000565.html
# Hi Santiago,
#
# Look at the ace function in the Ape package - to get a max parsimony
# reconstruction set branch lengths =1 and run a maximum likelihood
# estimation. You can find further information here: http://www.r-phylo.org/wiki/HowTo/Ancestral_State_Reconstruction
#
#x = dtf_to_phyDat(char_dtf)
# make a parsimony tree:
pars_tr = chtr2
# set all branch lengths to 1
pars_tr$edge.length[!is.na(pars_tr$edge.length)] = 1
#ace(char_dtf[,1], pars_tr, type="discrete", method="ML", model="ER")
par(mfrow=c(1,2))
ACE_pars_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
{
(ACE_pars = ace(char_dtf[,i], pars_tr, type="discrete", method="ML", model="ER", ip=0.3))
# convert to parsimony
maxpar_node_vals = ACE_pars$lik.anc
maxpar_node_maxes = apply(maxpar_node_vals, 1, max)
maxpar_node_vals = adf(maxpar_node_vals)
maxpar_node_MP = array(NA, dim=c(num_internal_nodes, 1))
for (j in 1:nrow(maxpar_node_vals))
{
tmpTFs = maxpar_node_vals[j,] == maxpar_node_maxes[j]
names(tmpTFs)
colnum = which(tmpTFs)
if (length(colnum) > 1)
{
tmplist = c()
for (k in 1:length(colnum))
{
tmplist[k] = names(maxpar_node_vals)[colnum[k]]
}
tmpstr = paste(tmplist, collapse="&")
maxpar_node_MP[j] = tmpstr
} else {
maxpar_node_MP[j] = names(maxpar_node_vals)[colnum]
}
}
ACE_pars_estimates[, i] = maxpar_node_MP
# plot results
tree_ht = get_max_height_tree(chtr2)
label_offset = 0.05*tree_ht
plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset)
if (num_uniq[i] == 2)
{
colors = c("yellow", "blue")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors[1]
tmpcols[tmpcols == 1] = colors[2]
}
if (num_uniq[i] == 3)
{
colors = c("yellow", "blue", "red")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors[1]
tmpcols[tmpcols == 1] = colors[2]
tmpcols[tmpcols == 2] = colors[3]
}
tiplabels(pch = 19, col = tmpcols, cex = 2) #adj = 0,
legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors)
nodelabels(thermo=ACE_pars$lik.anc, cex=1, piecol=colors)
titletxt = paste("ML, parsimony-like reconstruction of\n", names(char_dtf)[i], " character", sep="")
title(titletxt, cex=0.5)
}
ACE_pars_estimates = adf(ACE_pars_estimates)
names(ACE_pars_estimates) = names(char_dtf)
pars_tr_dtf = prt(pars_tr)
row.names(ACE_pars_estimates) = tree_dtf$node[ (length(pars_tr$tip.label)+1): length(pars_tr_dtf$node)]
#row.names(ACE_pars_estimates) = row.names(char_dtf)
(ACE_pars_estimates)
write_table_good(ACE_pars_estimates, "ACE_pars_estimates_v8.txt")
# Do ML reconstruction, with varying branch lengths (i.e. not branches = 1 like "parsimony")
par(mfrow=c(1,2))
ACE_ML_estimates = array(NA, dim=c(num_internal_nodes, ncol(char_dtf)) )
for (i in 1:ncol(char_dtf))
{
(ACE_ML = ace(char_dtf[,i], chtr2, type="discrete", method="ML", model="ER", ip=0.3))
# convert to the single best ML estimate (plurality vote)
maxML_node_vals = ACE_ML$lik.anc
maxML_node_maxes = apply(maxML_node_vals, 1, max)
maxML_node_vals = adf(maxML_node_vals)
maxML_node_best = array(NA, dim=c(num_internal_nodes, 1))
for (j in 1:nrow(maxML_node_vals))
{
tmpTFs = round(maxML_node_vals[j,], 3) == round(maxML_node_maxes[j], 3)
names(tmpTFs)
colnum = which(tmpTFs)
if (length(colnum) > 1)
{
tmplist = c()
for (k in 1:length(colnum))
{
tmplist[k] = names(maxML_node_vals)[colnum[k]]
}
tmpstr = paste(tmplist, collapse="&")
maxML_node_best[j] = tmpstr
} else {
maxML_node_best[j] = names(maxML_node_vals)[colnum]
}
}
#cbind(maxML_node_vals, maxML_node_maxes, maxML_node_best)
ACE_ML_estimates[, i] = maxML_node_best
# plot results
tree_ht = get_max_height_tree(chtr2)
label_offset = 0.05*tree_ht
plot(chtr2, show.node.label=FALSE, cex=0.9, no.margin = TRUE, x.lim=1.6*tree_ht, label.offset=label_offset)
if (num_uniq[i] == 2)
{
colors = c("yellow", "blue")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors[1]
tmpcols[tmpcols == 1] = colors[2]
}
if (num_uniq[i] == 3)
{
colors = c("yellow", "blue", "red")
tmpcols = char_dtf[,i]
tmpcols[tmpcols == 0] = colors[1]
tmpcols[tmpcols == 1] = colors[2]
tmpcols[tmpcols == 2] = colors[3]
}
tiplabels(pch = 19, col = tmpcols, cex = 2) #adj = 0,
legend(0, 3, legend=as.character(0:max(char_dtf[,i])), fill=colors)
nodelabels(thermo=ACE_ML$lik.anc, cex=1, piecol=colors)
titletxt = paste("ML, equal rates reconstruction of\n", names(char_dtf)[i], " character", sep="")
title(titletxt, cex=0.5)
}
ACE_ML_estimates = adf(ACE_ML_estimates)
names(ACE_ML_estimates) = names(char_dtf)
pars_tr_dtf = prt(pars_tr)
row.names(ACE_ML_estimates) = tree_dtf$node[ (length(pars_tr$tip.label)+1): length(pars_tr_dtf$node)]
#row.names(ACE_ML_estimates) = row.names(char_dtf)
(ACE_ML_estimates)
write_table_good(ACE_ML_estimates, "ACE_ML_estimates_v8.txt")
# Write out the trees we generates, also...
write.tree(tr, "tr.newick")
write.tree(tr2, "tr2.newick")
write.tree(chtr2, "chtr2.newick")