Generating Random Trees Testing Cladogram Imbalance
##############################################################
# =============================================
# Lab 11: Tree simulation and tree shape in R
# =============================================
# by Nick Matzke (and whoever else adds to this PhyloWiki page)
# Copyright 2011-infinity
# matzkeATberkeley.edu
# March 2011
#
# Please link/cite if you use this, email me if you have 
#   thoughts/improvements/corrections.
#
##############################################################
#
# Free to use/redistribute under:
# Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0) 
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the above license, linked here:
# 
# http://creativecommons.org/licenses/by-nc-sa/3.0/
# 
# Summary:
#
# You are free:
#
#   * to Share -- to copy, distribute and transmit the work
#   * to Remix -- to adapt the work
#
# Under the following conditions:
#
#   * Attribution -- You must attribute the work in the manner 
#     specified by the author or licensor (but not in any way that 
#     suggests that they endorse you or your use of the work).
#   * Noncommercial -- You may not use this work for commercial purposes. 
#
#   * Share Alike -- If you alter, transform, or build upon this work,
#     you may distribute the resulting work only under the same or
#     similar license to this one. 
#
# http://creativecommons.org/licenses/by-nc-sa/3.0/
# 
###################################################################
#
# Assignment for Thursday, March 10
#
# Run the code below, and answer the questions at the end in email.
#
# Please make the subject of your email:
# "IB200B lab11 by [your name]"
# Thanks!
# Nick
# 
###################################################################

# setup
library(ape)

# Rabosky's laser package
# does Yule trees, but in table form
#library(laser)

# Hmm, doesn't install correctly
# library(diversitree)

# for birthdeath.tree
library(geiger)

# Remote sourcing:
source("http://ib.berkeley.edu/courses/ib200b/scripts/_genericR_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_conus_analyses_v1.R")
source("http://ib.berkeley.edu/courses/ib200b/scripts/_R_tree_functions_v1.R")

# Let's generate some random trees
numtips = 20

# branching times determined by coalescent process (nodes pushed towards tips)
coalescent_tree = rcoal(numtips, br="coalescent")
plot(coalescent_tree)
title("Random coalescent tree")

# branching times under a uniform distribution (non-ultrametric)
runif_tree = rtree(numtips)
plot(runif_tree)
title("Random uniform tree")

# make branch lengths Grafen
rgraf_tree = compute.brlen(runif_tree, method="Grafen")
plot(rgraf_tree)
title("Random tree with Grafen branch lengths")

# Simulate trees under the Yule (pure-birth) process
birthrate = 1
deathrate = 0
yuletree = birthdeath.tree(b=birthrate, d=deathrate, taxa.stop=numtips)
plot(yuletree)
title("Tree simulated under the pure-birth process")

# Simulate trees under the birth-death process
# NOTE: Birth-death trees sometimes die out by accident.
# So, re-run the code several times until you get a tree
# that actually gets up to 20 taxa.
birthrate = 1
deathrate = 0.5
bdtree = birthdeath.tree(b=birthrate, d=deathrate, taxa.stop=numtips)
plot(bdtree)
title("Tree simulated under the birth-death process")

# the bdtree contains extinct ("fossil") lineages, which means it's not 
# ultrametric.  So we'll do Grafen branch lengths again.  NOTE: DON'T 
# DO THIS AT HOME, IT IS PRETTY MEANINGLESS EVEN AS A SIMULATION,
# THIS IS JUST A "FOR INSTANCE" TO SHOW HOW THE CODE WORKS
bdtree2 = compute.brlen(bdtree, method="Grafen")
plot(bdtree2)
title("Tree simulated under the birth-death process, and arbitrarily changing\nthe branch lengths using Grafen method to make ultrametric")

# Calculate tree balance for each tree;
# we are going to calculate the number of 
# branches on each side of the basal split
bal_ctree = balance(coalescent_tree)
(ctree_RminusL = bal_ctree[1,2] - bal_ctree[1,1])

bal_rtree = balance(runif_tree)
(rtree_RminusL = bal_rtree[1,2] - bal_rtree[1,1])

bal_rgtree = balance(rgraf_tree)
(rgtree_RminusL = bal_rgtree[1,2] - bal_rgtree[1,1])

bal_ytree = balance(yuletree)
(ytree_RminusL = bal_ytree[1,2] - bal_ytree[1,1])

bal_bdtree = balance(bdtree)
(bdtree_RminusL = bal_bdtree[1,2] - bal_bdtree[1,1])

bal_bdtree2 = balance(bdtree2)
(bdtree2_RminusL = as.numeric(bal_bdtree2[1,2] - bal_bdtree2[1,1]) )

# QUESTION: Do the branch lengths matter for this statistic?

# What's the distribution of:
#
# (number of tips on left side of the tree 
# ...minus...
# number of tips on the right side of tree)
#
# ...if you do a bunch of simulations?

# set number of simulations
numsims = 100
numtips = 20
# make an empty array
balance_matrix = matrix(NA, nrow=numsims, ncol=4)
for (i in 1:numsims)
    {
    cat("Simulation #", i, "\n", sep="")

    # simulate the trees:
    coalescent_tree = rcoal(numtips, br="coalescent")
    runif_tree = rtree(numtips)

    # Yule tree
    birthrate = 1
    deathrate = 0
    yuletree = birthdeath.tree(b=birthrate, d=deathrate, taxa.stop=numtips)

    # simulate a birth-death tree until you get 20 tips
    birthrate = 1
    deathrate = 0.5
    for (j in 1:100)
        {
        #cat("Trying bdtree #", j, "\n", sep="")
        bdtree = birthdeath.tree(b=birthrate, d=deathrate, taxa.stop=numtips)
        bdtree_table = prt(bdtree, printflag=FALSE)

        #print(sum(round(bdtree_table$time_bp, digits=5) == 0))
        numtips_observed = sum(round(bdtree_table$time_bp, digits=5) == 0)
        if (numtips_observed >= numtips)
            {
            rows_to_drop = bdtree_table[bdtree_table$node.type=="tip", ]
            labels_to_drop = rows_to_drop[(round(rows_to_drop$time_bp, digits=5) > 0), ]$label
            bdtree = drop.tip(bdtree, labels_to_drop)
            #bdtree = read.tree(text=fixtree(bdtree))

            # Sometime drop.tip messes up the tree, control for this
            if (length(bdtree$tip.label) < numtips)
                {
                foo = 2
                } else {
                # yay, you have a good tree, break out and continue
                break
                }
            }
        }

    # Calculate tree balance for each tree;
    # we are going to calculate the number of 
    # branches on each side of the basal split
    bal_ctree = balance(coalescent_tree)
    (ctree_RminusL = bal_ctree[1,2] - bal_ctree[1,1])

    bal_rtree = balance(runif_tree)
    (rtree_RminusL = bal_rtree[1,2] - bal_rtree[1,1])

    bal_ytree = balance(yuletree)
    (ytree_RminusL = bal_ytree[1,2] - bal_ytree[1,1])

    bal_bdtree = balance(bdtree)
    (bdtree_RminusL = bal_bdtree[1,2] - bal_bdtree[1,1])

    # fill in a temporary row
    tmprow = c(ctree_RminusL, rtree_RminusL, ytree_RminusL, bdtree_RminusL)

    balance_matrix[i, ] = tmprow
    }

# convert to a data frame (adf)
balance_matrix = adf(balance_matrix)

# add column names
names(balance_matrix) = c("ctree", "rtree", "ytree", "bdtree")

# plot histograms

# 4 subplots
par(mfrow=c(2,2))
for (i in 1:ncol(balance_matrix))
    {
    hist(balance_matrix[,i], breaks=numsims/10, main=names(balance_matrix)[i], xlab="#right tips - # left tips", ylab="count")
    }

# It will take a few seconds to run 100 simulations for each of the 4 tree processes.
# QUESTION: Do you see any differences in the distributions?
#
# Now, change numsims to 1000 and re-run (this will take a few minutes, smoke 'em 
# if you got 'em).  
#
# QUESTION: What do the distributions look like now?  If your observed data 
# gave you a tree with 1 tip on the left, and 19 on the right, would you say
# that this observation has a low p-value and thus was unlikely to be produced
# by your null model?
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License