Example Biogeobears Scripts

(Back-link to BioGeoBEARS)

Get your list of ranges/states, from your list of areas

(link to this section)

# Get your states list (assuming, say, 4-area analysis, with max. rangesize=4)
max_range_size = 4
#areas = getareas_from_tipranges_object(tipranges)
areas = c("A", "B", "C", "D")

# This is the list of states/ranges, where each state/range
# is a list of areas, counting from 0
states_list_0based = rcpp_areas_list_to_states_list(areas=areas, maxareas=max_range_size, include_null_range=TRUE)

# Make the list of ranges
ranges_list = NULL
for (i in 1:length(states_list_0based))
    {    
    if ( (length(states_list_0based[[i]]) == 1) && (is.na(states_list_0based[[i]])) )
        {
        tmprange = "_"
        } else {
        tmprange = paste(areas[states_list_0based[[i]]+1], collapse="")
        }
    ranges_list = c(ranges_list, tmprange)
    }

# Look at the ranges list
ranges_list

Manual modification of states_list (list of geographic ranges, a.k.a. the list of states in the state space)

(link to this section)

BioGeoBEARS users sometimes want to eliminate geographic ranges that are "disjunct", i.e. that consist of areas that leave out a connecting area in the middle. There are two options to do this:

  1. Attempt to use the areas_adjacency file option — but this only works for the simplest cases (e.g., Hawaiian islands where the only allowed areas are 2 adjacent islands), with a low max_range_size.
  2. Manually modify the states_list. This strategy can allow/disallow any combination of states/ranges that you like, in any time bin, but it requires you to understand how to manually modify the default states_list.

The code below shows how to do a manual modification of the states_list in BioGeoBEARS. The states_list is list of geographic ranges, a.k.a. the list of states in the state space.

(NOTE: People often confuse "areas" and "states/ranges". States/ranges are made up of areas, states/ranges and areas are not the same thing!)

This code is useful because the "areas_adjacency" file is only useful for the simplest cases.

#######################################################
# START MANUAL MODIFICATION OF STATES LIST
# Manually modify the list of states (geographic ranges)
# to disallow some ranges that are disjunct
# (allow more complex scenarios than the area-adjacency file)
#######################################################

#######################################################
# NOTE! "areas" and "states/ranges" are DIFFERENT THINGS
# NOTHING WILL MAKE SENSE UNLESS you understand that a 
# STATE/RANGE is made up of some number of areas.
#
# E.g.: 2 areas (A,B) equals 
# 4 states/geographic ranges (null, A, B, AB)
#######################################################

# Get your states list (assuming, say, 4-area analysis, with max. rangesize=4)
max_range_size = 4
areas = getareas_from_tipranges_object(tipranges)
#areas = c("A", "B", "C", "D")

# This is the list of states/ranges, where each state/range
# is a list of areas, counting from 0
states_list_0based = rcpp_areas_list_to_states_list(areas=areas, maxareas=max_range_size, include_null_range=TRUE)

# How many states/ranges, by default: 163
length(states_list_0based)

# Make the list of ranges
ranges_list = NULL
for (i in 1:length(states_list_0based))
    {    
    if ( (length(states_list_0based[[i]]) == 1) && (is.na(states_list_0based[[i]])) )
        {
        tmprange = "_"
        } else {
        tmprange = paste(areas[states_list_0based[[i]]+1], collapse="")
        }
    ranges_list = c(ranges_list, tmprange)
    }

# Look at the ranges list
ranges_list

# How many states/ranges, by default: 163
length(ranges_list)

# Let's remove some non-adjacent ranges
nonadjacent=c("AC","AD","AE","AF","AG","AH","BE","BG","BH","CG","CH","DF","DG","DH","EH")
keepTF = ranges_list %in% nonadjacent == FALSE

ranges_list_NEW = ranges_list[keepTF]
length(ranges_list_NEW)     # now 148

states_list_0based_NEW = states_list_0based[keepTF]
length(states_list_0based_NEW)     # now 148

# INPUT the NEW states list into the BioGeoBEARS_run_object
BioGeoBEARS_run_object$states_list = states_list_0based_NEW

#######################################################
# END MANUAL MODIFICATION OF THE STATES LIST
#######################################################

Re-using the manually modified of states_list

(link to this section)

If every model is going to use the same modified list of states, you can just re-input states_list_0based_NEW into every new model you set up. Do this at the end of the model set-up (in case you have some other modifications to the list of states, in which case you will have to modify your modification of the states list).

# INPUT the NEW states list into the BioGeoBEARS_run_object
BioGeoBEARS_run_object$states_list = states_list_0based_NEW

Manually modify the states list in time-stratified analyses

(link to this section)

Users can also modify the states list in each time-bin in a time-stratified analysis. Set up a time-stratified analysis like usual, then at the end, input a states_list for each time-bin.

#######################################################
# START MANUAL MODIFICATION OF STATES LIST
# Manually modify the list of states (geographic ranges)
# to disallow some ranges that are disjunct
# (allow more complex scenarios than the area-adjacency file)
#######################################################

#######################################################
# NOTE! "areas" and "states/ranges" are DIFFERENT THINGS
# NOTHING WILL MAKE SENSE UNLESS you understand that a 
# STATE/RANGE is made up of some number of areas.
#
# E.g.: 2 areas (A,B) equals 
# 4 states/geographic ranges (null, A, B, AB)
#######################################################

# Get your states list (assuming, say, 4-area analysis, with max. rangesize=4)
max_range_size = 4
areas = getareas_from_tipranges_object(tipranges)
#areas = c("A", "B", "C", "D")

# This is the list of states/ranges, where each state/range
# is a list of areas, counting from 0
states_list_0based = rcpp_areas_list_to_states_list(areas=areas, maxareas=max_range_size, include_null_range=TRUE)

# How many states/ranges, by default: 163
length(states_list_0based)

# Make the list of ranges
ranges_list = NULL
for (i in 1:length(states_list_0based))
    {    
    if ( (length(states_list_0based[[i]]) == 1) && (is.na(states_list_0based[[i]])) )
        {
        tmprange = "_"
        } else {
        tmprange = paste(areas[states_list_0based[[i]]+1], collapse="")
        }
    ranges_list = c(ranges_list, tmprange)
    }

# Look at the ranges list
ranges_list

# How many states/ranges, by default: 163
length(ranges_list)

# Let's remove some non-adjacent ranges

# STRATUM 1 (youngest)
nonadjacent1=c("AC","AD","AE","AF","AG","AH","BE","BG","BH","CG","CH","DF","DG","DH","EH")
keepTF1 = ranges_list %in% nonadjacent1 == FALSE
ranges_list_NEW1 = ranges_list[keepTF1]
length(ranges_list_NEW1)     # now 148

# STRATUM 2
nonadjacent2=c("AC","AD","AE","AF","AG","AH","BE","BG","BH","CG","CH","DF","DG")
keepTF2 = ranges_list %in% nonadjacent2 == FALSE
ranges_list_NEW2 = ranges_list[keepTF2]
length(ranges_list_NEW2)

# STRATUM 3 (oldest)
nonadjacent3=c("AC","AD","AE","AF","AG","AH","BE","BG","BH","CG","CH")
keepTF3 = ranges_list %in% nonadjacent3 == FALSE
ranges_list_NEW3 = ranges_list[keepTF3]
length(ranges_list_NEW3)

# Make the stratum-specific states list
states_list_0based_NEW1 = states_list_0based[keepTF1]
states_list_0based_NEW2 = states_list_0based[keepTF2]
states_list_0based_NEW3 = states_list_0based[keepTF3]

# INPUT the NEW states list into the BioGeoBEARS_run_object, STRATIFIED
BioGeoBEARS_run_object$lists_of_states_lists_0based[[1]] = states_list_0based_NEW1
BioGeoBEARS_run_object$lists_of_states_lists_0based[[2]] = states_list_0based_NEW2
BioGeoBEARS_run_object$lists_of_states_lists_0based[[3]] = states_list_0based_NEW3
#######################################################
# END MANUAL MODIFICATION OF THE STATES LIST
#######################################################

Obviously, this allows you to make different states_list_0based_NEW objects, and use those instead of re-using the same one in each time-bin.

Table of probabilities of states/ranges at each node

(link to this section)

If you want a get a table of the probabilities of each state (geographic range) at each node — for e.g. further plotting or analysis — follow the instructions below. This assumes you have loaded your phylogeny to "tr" and your BioGeoBEARS results to "res".

Based on this BioGeoBEARS Google Group post: https://groups.google.com/forum/#!topic/biogeobears/_Ug7UALO4Fw

# This can be run right after e.g. the Psychotria DEC
# example. You would have to change it if you want: 
# - different area names than K O M H, 
# - or have a different number of areas
# - or have a different max_range_size

####################################
# Probabilities of states/ranges at each node
####################################
# To get the probabilities of each state/range at each node:
# What you want, if "res" is your results object, is:
res$ML_marginal_prob_each_state_at_branch_top_AT_node

# In this table:
# - columns are states/ranges
# - rows are nodes, in APE order (tips, then root, then internal)

#  You can see the node numbers in the same APE order with:
trtable = prt(tr, printflag=FALSE)
head(trtable)
tail(trtable)

# You can plot APE node labels with:
plot(tr)
axisPhylo()
nodelabels()
tiplabels(1:length(tr$tip.label))

# Get your states list (assuming, say, 4-area analysis, with max. rangesize=4)
max_range_size = 4
areas = getareas_from_tipranges_object(tipranges)

# This is the list of states/ranges, where each state/range
# is a list of areas, counting from 0
states_list_0based = rcpp_areas_list_to_states_list(areas=areas, maxareas=max_range_size, include_null_range=TRUE)

# Make the list of ranges
ranges_list = NULL
for (i in 1:length(states_list_0based))
    {    
    if ( (length(states_list_0based[[i]]) == 1) && (is.na(states_list_0based[[i]])) )
        {
        tmprange = "_"
        } else {
        tmprange = paste(areas[states_list_0based[[i]]+1], collapse="")
        }
    ranges_list = c(ranges_list, tmprange)
    }

# Look at the ranges list
ranges_list

# Make the node numbers the row names
# Make the range_list the column names
range_probabilities = as.data.frame(res$ML_marginal_prob_each_state_at_branch_top_AT_node)
row.names(range_probabilities) = trtable$node
names(range_probabilities) = ranges_list

# Look at the table (first six rows)
head(range_probabilities)

# Write the table to a tab-delimited text file (for Excel etc.)
write.table(range_probabilities, file="range_probabilities.txt", quote=FALSE, sep="\t")

# Look at the file
moref("range_probabilities.txt")

Plot per-area probability (probability of each each area at each node)

(link to this section)

The code below produces a plot of per-area probabilities. Especially for complex geography (many areas), this may be easier for readers to interpret. Making a plot that is readable can require some manual editing of box sizes, positioning, etc., so I have shown how to do that.

It also produces probs_each_area, a table with the per-area probabilities (columns) by node (rows). You can also use infprobs_to_probs_of_each_area() to get this table.

Note: This script assumes you have just run the default Psychotria example script: http://phylo.wikidot.com/biogeobears#script

See the graphics, below.

#######################################################
# Plot DEC/DEC+J M0 per-area probabilities -- with COLOR
# 
# (This assumes you have run a standard BioGeoBEARS analysis
#  from the main example script first)
#######################################################

wd = "set_this_to_wherever_you_like"
setwd(wd)

pdffn = "per-area_probabilities_example_DEC_DECJ_M0_v1_wColor.pdf"
pdf(pdffn, height=6, width=6)

areas = getareas_from_tipranges_object(tipranges)
states_list_0based = rcpp_areas_list_to_states_list(areas=areas, maxareas=max_range_size, include_null_range=TRUE)

barwidth_proportion = 0.02
barheight_proportion = 0.025
split_reduce_x = 0.85    # fraction of sizes for the split probabilities
split_reduce_y = 0.85    # fraction of sizes for the split probabilities

# Manual offsets, if desired (commented out below)
offset_nodenums = NULL
offset_xvals = NULL
offset_yvals = NULL

root.edge = TRUE

# DEC
titletxt = "Psychotria: probability of occupancy per area\nunder DEC M0 analysis"

# Plot at nodes
probs_each_area = plot_per_area_probs(tr, res=resDEC, areas=areas, states_list_0based=states_list_0based, titletxt=titletxt, cols_each_area=TRUE, barwidth_proportion=barwidth_proportion, barheight_proportion=barheight_proportion, offset_nodenums=offset_nodenums, offset_xvals=offset_xvals, offset_yvals=offset_yvals, root.edge=root.edge)
print("Waiting...")
Sys.sleep(2)    # wait for this to finish

# Calculate per-area probabilities for corners
relprobs_matrix = resDEC$ML_marginal_prob_each_state_at_branch_bottom_below_node
probs_each_area = infprobs_to_probs_of_each_area(relprobs_matrix, states_list=states_list_0based)

# Add to left corners
#offset_nodenums = c(20, 21)
#offset_xvals = c(0, 0)
#offset_yvals = c(-1, -0.25)
add_per_area_probs_to_corners(tr, probs_each_area, left_or_right="left", cols_each_area=TRUE, barwidth_proportion=split_reduce_x*barwidth_proportion, barheight_proportion=split_reduce_y*barheight_proportion, offset_nodenums=offset_nodenums, offset_xvals=offset_xvals, offset_yvals=offset_yvals, root.edge=root.edge)
print("Waiting...")
Sys.sleep(1)    # wait for this to finish

# Add to right corners
#offset_nodenums = c(19, 17, 20, 15)
#offset_xvals = c(0, 0, 0, 0)
#offset_yvals = c(1.25, 2, 1, 1.25)
add_per_area_probs_to_corners(tr, probs_each_area, left_or_right="right", cols_each_area=TRUE, barwidth_proportion=split_reduce_x*barwidth_proportion, barheight_proportion=split_reduce_y*barheight_proportion, offset_nodenums=offset_nodenums, offset_xvals=offset_xvals, offset_yvals=offset_yvals, root.edge=root.edge)
print("Waiting...")
Sys.sleep(1)    # wait for this to finish

# DEC+J
titletxt = "Psychotria: probability of occupancy per area\nunder DEC+J M0 analysis"

offset_nodenums = NULL
offset_xvals = NULL
offset_yvals = NULL

# Plot at nodes
probs_each_area = plot_per_area_probs(tr, res=resDECj, areas=areas, states_list_0based=states_list_0based, titletxt=titletxt, cols_each_area=TRUE, barwidth_proportion=barwidth_proportion, barheight_proportion=barheight_proportion, offset_nodenums=offset_nodenums, offset_xvals=offset_xvals, offset_yvals=offset_yvals, root.edge=root.edge)
Sys.sleep(1)    # wait for this to finish

# Calculate per-area probabilities for corners
relprobs_matrix = resDECj$ML_marginal_prob_each_state_at_branch_bottom_below_node
probs_each_area = infprobs_to_probs_of_each_area(relprobs_matrix, states_list=states_list_0based)

# Add to left corners
#offset_nodenums = c(20, 21)
#offset_xvals = c(0, 0)
#offset_yvals = c(-1, -0.25)
add_per_area_probs_to_corners(tr, probs_each_area, left_or_right="left", cols_each_area=TRUE, barwidth_proportion=split_reduce_x*barwidth_proportion, barheight_proportion=split_reduce_y*barheight_proportion, offset_nodenums=offset_nodenums, offset_xvals=offset_xvals, offset_yvals=offset_yvals, root.edge=root.edge)

# Add to right corners
#offset_nodenums = c(19, 17, 20, 15)
#offset_xvals = c(0, 0, 0, 0)
#offset_yvals = c(1.25, 2, 1, 1.25)
add_per_area_probs_to_corners(tr, probs_each_area, left_or_right="right", cols_each_area=TRUE, barwidth_proportion=split_reduce_x*barwidth_proportion, barheight_proportion=split_reduce_y*barheight_proportion, offset_nodenums=offset_nodenums, offset_xvals=offset_xvals, offset_yvals=offset_yvals, root.edge=root.edge)

dev.off()
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)

Plot graphics

(link to this section)

BioGeoBEARS_per-area_probabilities_plot_Psychotria_DEC_M0.png
BioGeoBEARS_per-area_probabilities_plot_Psychotria_DEC%2BJ_M0.png
Figure Caption: Plot of per-area probabilities for Hawaiian Psychotria under the DEC model. Figure Caption: Plot of per-area probabilities for Hawaiian Psychotria under the DEC+J model.

Node numbering in APE/BioGeoBEARS

(link to this section)

Using prt() to make a table of node numbers, branch lengths, etc.

(link to this section)

Note: R's tree structure is pretty non-intuitive, compared to the "a tree is a collection of node objects" structure that is taught in most phylogenetics courses and used in e.g. C++ software. It's done this way because R likes lists, and doesn't like objects.

I wrote the function prt() to help me figure it out:

# Example hominin tree with 2 fossils
trstr = "(((((Hsapiens:0.4,Hneander:0.4):4.6,Ardi:0.4):1.0,Pan:6.0):1.0,Gorilla:7.0):6.0,Pongo:13.0);"
tr = read.tree(file="", text=trstr)

# The APE "phylo" object, tr, has 4 items in it. You can see them with the names() command:
names(tr)
# "edge"        "Nnode"       "tip.label"   "edge.length"

# If you look at each one, you will see the tree structure
tr$edge
#      [,1] [,2]
# [1,]    7    8
# [2,]    8    9
# [3,]    9   10
# [4,]   10   11
# [5,]   11    1
# [6,]   11    2
# [7,]   10    3
# [8,]    9    4
# [9,]    8    5
#[10,]    7    6
#
# This is the edge matrix.  The rows are edges (branches). 
# The left column is ancestral nodes (the nodes at the bottom of branches)
# The right column is descendant nodes (the nodes at the top of branches)
# NOTE: THIS MEANS THAT, UNLIKE MOST COMPUTER PHYLOGENY STRUCTURES,
# IN APE, THE NODES ARE IMPLICIT. THIS CAUSES MADDENING PROBLEMS
# FOR THINGS LIKE ALTERING TREES, SINCE ANY ALTERATION CAN
# REQUIRE RE-NUMBERING ALL OF THE NODES

# tr$Nnode describes the number of INTERNAL nodes
tr$Nnode
# [1] 5
# 
# In a fully bifurcating tree, the number of internal nodes is (numtips-1). But in a 
# tree with polytomies, it can be fewer.

# tr$tip.label contains the tip labels
# Note: tips are nodes, and they get node numbers 1-5 (since we have 5 tips here)
tr$tip.label
# [1] "Hsapiens" "Hneander" "Ardi"     "Pan"      "Gorilla"  "Pongo"

# NOTE: The root node is always node number numtips+1
# Therefore, to get the node number of the root:
node_number_of_root_node = length(tr$tip.label) + 1

# NOTE ALSO: You can get the total number of nodes like this:
numtips = length(tr$tip.label)
num_internal_nodes = tr$Nnode
total_number_of_nodes = numtips + num_internal_nodes

# Branch lengths:
tr$edge.length
#  [1]  6.0  1.0  1.0  4.6  0.4  0.4  0.4  6.0  7.0 13.0
# NOTE: These branch lengths are in the same order as the rows of tr$edge

# Use prt() to print a nice tree table, with all of the node numbers explicit
# options:
# * printflag=FALSE means don't print to screen immediately
# * get_tipnames=TRUE creates a column, tr_table$tipnames, that lists all of the tipnames
#    descending from each node, sorted alphabetically and comma-delimited. This can
#    be VERY USEFUL for comparing trees, since different trees in R will NOT 
#    have the same node numbers, even if they have same taxa.
# * fossils_older_than=0.01 means that any tip more than 0.01 below the 
#    highest tip will be called a fossil in the column tr_table$fossils. This is not
#    set to 0, because sometimes trees have slight errors and not all of the 
#    living tips come up to exactly 0 million years before present.
tr_table = prt(t=tr, printflag=FALSE, get_tipnames=TRUE, fossils_older_than=0.01)

# Print the tree table to screen:
tr_table

#    node ord_ndname node_lvl node.type parent_br edge.length ancestor daughter_nds node_ht
# 1     1          1        5       tip         5         0.4       11                 13.0
# 2     2          2        5       tip         6         0.4       11                 13.0
# 3     3          3        4       tip         7         0.4       10                  8.4
# 4     4          4        3       tip         8         6.0        9                 13.0
# 5     5          5        2       tip         9         7.0        8                 13.0
# 6     6          6        1       tip        10        13.0        7                 13.0
# 7     7          7        0      root        NA          NA       NA         8, 6     0.0
# 8     8          8        1  internal         1         6.0        7         9, 5     6.0
# 9     9          9        2  internal         2         1.0        8        10, 4     7.0
# 10   10         10        3  internal         3         1.0        9        11, 3     8.0
# 11   11         11        4  internal         4         4.6       10         1, 2    12.6
#    time_bp fossils    label                                 tipnames
# 1      0.0   FALSE Hsapiens                                 Hsapiens
# 2      0.0   FALSE Hneander                                 Hneander
# 3      4.6    TRUE     Ardi                                     Ardi
# 4      0.0   FALSE      Pan                                      Pan
# 5      0.0   FALSE  Gorilla                                  Gorilla
# 6      0.0   FALSE    Pongo                                    Pongo
# 7     13.0      NA  inNode7 Ardi,Gorilla,Hneander,Hsapiens,Pan,Pongo
# 8      7.0      NA  inNode8       Ardi,Gorilla,Hneander,Hsapiens,Pan
# 9      6.0      NA  inNode9               Ardi,Hneander,Hsapiens,Pan
# 10     5.0      NA inNode10                   Ardi,Hneander,Hsapiens
# 11     0.4      NA inNode11                        Hneander,Hsapiens
# 

# See the column headings:
names(tr_table)

# First few rows with the head() command:
head(tr_table)

# Last few rows with the tail() command:
tail(tr_table)

# NOTE: The root node has no parent branch or edge length, because it's 
#            the root node
rootnodenum = length(tr$tip.label) + 1

# Show just the row with the root node
tr_table[rootnodenum, ]
#   node ord_ndname node_lvl node.type parent_br edge.length ancestor daughter_nds node_ht
# 7    7          7        0      root        NA          NA       NA         8, 6       0
#   time_bp fossils   label                                 tipnames
# 7      13      NA inNode7 Ardi,Gorilla,Hneander,Hsapiens,Pan,Pongo

# That's it!  It's good to look at both the table, and the plot of APE node numbers, and compare:

ntips = length(tr$tip.label)
Rnodenums = (ntips+1):(ntips+tr$Nnode)
tipnums = 1:ntips
plot(tr, label.offset=0.25, cex=1.25)
axisPhylo()
tiplabels(cex=1.5)
nodelabels(text=Rnodenums, node=Rnodenums, cex=1.5)
title("APE/BioGeoBEARS node numbers")

Plot of APE/BioGeoBEARS node numbers

(link to this section)

Here is that image of APE/BioGeoBEARS node numbers:

APE_BioGioBEARS_node_numbers.png

Comparing node numbers from APE/BioGeoBEARS, Python Lagrange, C++ Lagrange, and DIVA

(link to this section)

#######################################################
# This script shows how nodes are labeled in different
# biogeography software. Annoyingly, APE, Python 
# Lagrange, C++ Lagrange, and DIVA each use a 
# different node numbering scheme!  (The great thing 
# about standards is that everybody has one!)
# 
# Having the node numbering is mostly useful to 
# translate the node numbers from other programs 
# into R node numbers for plotting in R/BioGeoBEARS.
#
# Also note that if your goal is to e.g. compare 
# the results of a biogeography analysis on different
# trees from a Bayesian posterior distribution, node 
# numbers will not match between trees, which could 
# have tips, clades etc. in a different order.
# 
# (It gets even more complicated if you wish to match
# corners as well as nodes. I have done it for 
# Lagrange and BioGeoBEARS, but it is not trivial.)
# 
# All scripts are copyright Nicholas J. Matzke, 
# please cite if you use. License: GPL-3
# http://cran.r-project.org/web/licenses/GPL-3
# 
#######################################################

# Setup
library(ape)
library(BioGeoBEARS)    # for get_lagrange_nodenums(), postorder_nodes_phylo4_return_table()
library(phylobase)        # required for postorder_nodes_phylo4_return_table()

# Fix to postorder_nodes_phylo4_return_table
source("http://phylo.wdfiles.com/local--files/biogeobears/BioGeoBEARS_readwrite_v1.R")

# Set your working directory to put the PDF in (pick your own)
# Default
getwd()

# Nick's choice for his working directory
wd = "/drives/Dropbox/_njm/__packages/BioGeoBEARS_setup/_examples_for_phylowiki/"
setwd(wd)

# Double-check
getwd()
list.files()

#######################################################
# Node numbering in different programs
#######################################################

# Example hominin tree with 2 fossils
trstr = "(((((Hsapiens:0.4,Hneander:0.4):4.6,Ardi:0.4):1.0,Pan:6.0):1.0,Gorilla:7.0):6.0,Pongo:13.0);"
tr = read.tree(file="", text=trstr)

pdffn = "Node_numbers_v1.pdf"
pdf(pdffn, height=11, width=11)
par(mfrow=c(2,2))

###################################################
# Plot APE/BioGeoBEARS node numbers
###################################################
ntips = length(tr$tip.label)
Rnodenums = (ntips+1):(ntips+tr$Nnode)
tipnums = 1:ntips
plot(tr, label.offset=0.25, cex=1.25)
axisPhylo()
tiplabels(cex=1.5)
nodelabels(text=Rnodenums, node=Rnodenums, cex=1.5)
title("APE/BioGeoBEARS node numbers")

###################################################
# Plot Python LAGRANGE node numbers
###################################################
# Nodenums for C++ Lagrange
downpass_node_matrix = get_lagrange_nodenums(tr)
downpass_node_matrix

# Nodenums for Python Lagrange include
# tip numbers N0 - N(ntips-1)
# Nodenums for Python Lagrange
downpass_node_matrix[,2] = downpass_node_matrix[,2] + ntips - 1

# Sort in LAGRANGE node order
downpass_node_matrix = downpass_node_matrix[order(downpass_node_matrix[,2]),]
downpass_node_matrix

# Plot the tree
plot(tr, cex=1.25)
axisPhylo()
ntips = length(tr$tip.label)
python_lg_nodenums = downpass_node_matrix[,2]
nodelabels(node=Rnodenums, text=python_lg_nodenums, cex=1.5)
title("Python LAGRANGE node numbers")

###################################################
# Plot C++ LAGRANGE node numbers
###################################################
# Nodenums for C++ Lagrange
downpass_node_matrix = get_lagrange_nodenums(tr)
downpass_node_matrix

# Sort in LAGRANGE node order
downpass_node_matrix = downpass_node_matrix[order(downpass_node_matrix[,2]),]
downpass_node_matrix

# Plot the tree
plot(tr, cex=1.25)
axisPhylo()
ntips = length(tr$tip.label)
python_lg_nodenums = downpass_node_matrix[,2]
nodelabels(node=Rnodenums, text=python_lg_nodenums, cex=1.5)
title("C++ LAGRANGE node numbers")

###################################################
# Plot DIVA node numbers
###################################################
# Nodenums for DIVA
tr4 = as( tr, "phylo4")
DIVA_nodes_table = postorder_nodes_phylo4_return_table(tr4)

plot(tr, show.tip.label=TRUE, cex=1.25)
axisPhylo()
nodelabels(node=DIVA_nodes_table$nodenums, text=DIVA_nodes_table$DIVA_postorder, cex=1.5)
#tiplabels()
title("DIVA node numbers")

# END PDF
dev.off()
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)

Plot comparing APE/BioGeoBEARS, Python Lagrange, C++ Lagrange, and DIVA node numbers

(link to this section)

Here is the plot comparing APE/BioGeoBEARS, Python Lagrange, C++ Lagrange, and DIVA node numbers, that comes from the above code. As they say, "the great thing about standards is that everybody has one"!

Node_numbers_v1.png

Processing output from DIVA, Python Lagrange, and C++, and graphical comparison to BioGeoBEARS

(link to this section)

Over the years I developed a number of functions to parse and plot the output from traditional biogeography programs. Pasting all of this as code would be complex, as reading/writing numerous files is required. Instead, I have made a zipfile with a "dummy" analysis within each program, and a script that reads in the output files and displays the results in R. You will have to customize the working directory to run these, and customize the filenames to run your own output files.

The zipfile is available in "Files", at the bottom of this page, or here: http://phylo.wikidot.com/local--files/example-biogeobears-scripts/AAAB_M0_BL1.zip

Working with BioGeoBEARS output

(link to this section)

#######################################################
# WORKING WITH BIOGEOBEARS OUTPUT
#######################################################
#
# Nick Matzke
# 2014-07-02
#
# Many scientists have now successfully ran the example 
# BioGeoBEARS script, and modified it to run their own data.
# 
# Once they have gotten familiar with the basic method and results,
# they want to know more about the BioGeoBEARS output, what it 
# means, and how to extract certain information from it.
#
# If you are in this category, this short tutorial is written for 
# you!
#
# 
# All scripts are copyright Nicholas J. Matzke, 
# please cite if you use. License: GPL-3
# http://cran.r-project.org/web/licenses/GPL-3
#######################################################

# 
# I am going to start by assuming that you have run the 
# DEC and DEC+J analyses on the example Psychotria dataset.
# 
# You will notice that the final step of these analyses was
# the function bears_optim_run(), e.g.:
# 
# > res = bears_optim_run(BioGeoBEARS_run_object)
# 
# "res" is short for "results".  Because I overwrite "res" 
# during each run, I store it under a new name a few lines
# later:
# 
# > resDEC = res
#
# "resDEC" now contains the results of the ML optimization under
# the DEC model. (resDECj holds the same for DEC+J, etc.)
#

# The key question for users to ask is: "What is in the 
# BioGeoBEARS results object?"
#
# In R, there are several ways to see what is stored in 
# an R object.  First, you can just type the name of the object, 
# and the whole thing will be spit out to the screen:

resDEC

# When you type this, pages and pages of stuff flies by.  This
# typically scares and intimidates new users, if they even 
# notice that pages flew by (sometimes, people make the mistake of 
# just looking at the last page).
# 
# To get a more organized idea of what is stored in resDEC, we 
# can use several standard R functions. The main "detective" 
# functions are class(), length(), names(), and summary().
# 
# First, type class(resDEC):

class(resDEC)

# ...the result is "calc_loglike_sp_results".  This isn't particularly 
# useful in this case, since I haven't yet assigned a 
# good class name like "BioGeoBEARS results object".  If you 
# are interested, though, "calc_loglike_sp_results" means 
# "calculation of log-likelihood with a speciational model
# results", which describes the main stuff inside of resDEC.
#
# An example of a useful result from class() is class(tr):

class(tr)

# This tells you that "tr" is a "phylo" object, i.e. a 
# phylogenetic tree object as used by the R package APE.  You 
# knew this already, this time, but if you didn't class() 
# would have told you.

# 
# So, class() tells you the class, or type, of the object.  Try 
# it first whenever you wonder what the heck some R variable is.
# 
# As class() wasn't hugely helpful for resDEC, we have to do 
# more detective work.  Use the length() function:

length(resDEC)

# The result is "14".  All this tells you is that there are 
# 14 of something inside resDEC.
#
# Wait, 14 of what?  Glad you asked!  We can get the names of the 
# 14 items with the names() command:

names(resDEC)

# This outputs a list of names of items in the results object. Each 
# of these names refers to an object inside the overall resDEC object.
# In other words, resDEC is just a list of other objects.
# 
# To get a little more information, use the summary() command:

summary(resDEC)

# This tells you the length of each sub-object, the class if it 
# has one, and the mode (e.g. numeric (numbers), character (text), 
# list (a list of other objects), etc.
#
# You can see that some objects are big, and some are small.

# To see what is in any particular sub-object, just use the "$"
# symbol.
# 
# For example, type "resDEC$total_loglikelihood"

resDEC$total_loglikelihood

# This gives you -34.5496.  This is the log-likelihood of the 
# geographic range data given the phylogenetic tree and 
# DEC model, with the maximum likelihood (ML) estimates of 
# d (rate of dispersal/range expansion) and 
# e (rate of extinction/range contraction).

# The other most relevant outputs are:
#
# 1. The ancestral state/ancestral range probabilities at each 
# node. These are the product of the downpass probabilities
# times the uppass probabilities. They represent the 
# ancestral state probabilities, conditional on the tip
# data and the specified model parameters.  Typically 
# users will be using the globally optimum parameter values
# estimated by ML.
# 
# (uncomment to print to screen)
# resDEC$ML_marginal_prob_each_state_at_branch_top_AT_node
#
# You can see that in this table, nrows = number of nodes
# (tips+internal), and ncols = number of states (= number 
# of possible ranges).
#
# The rows are in the order of APE's default node numbering
# scheme:
#
# Node number (and then interpretation)
# 1-ntips = tip node numbers
# ntips+1 = root node number
# (ntips+1):(ntips+num_internal_nodes) = internal node numbers
dim(resDEC$ML_marginal_prob_each_state_at_branch_top_AT_node)

#
# 2. The ancestral state/ancestral range probabilities at each 
# "corner", where the corner represents the states 
# instantaneously after the cladogenesis event, or, alternatively, 
# at the very bottom of the branch above.
# 
# (uncomment to print to screen)
# resDEC$ML_marginal_prob_each_state_at_branch_bottom_below_node
#
# The ancestral-ranges-at-corners table also has  nrows = number 
# of nodes (tips+internal), and ncols = number of states (= number 
# of possible ranges).
#
# Unlike the states at nodes/branch tops, discussed above, 
# corner state probabilities represent the states at the 
# branch bottoms, below the relevant node.
dim(resDEC$ML_marginal_prob_each_state_at_branch_top_AT_node)

# These node numbers represent the node numbers of the node 
# at the top of each branch.
# 
# To get the node numbers above each node, use:
# 
get_leftright_nodes_matrix_from_results(tr)
# (although you might have to experiment with switching 
#  "left" and "right", as they mean different things 
#  depending on the plotting program etc.)

# 3. The ML parameter estimates:
# 
resDEC$optim_result
#
# This is just the saved result of the ML search using optimx() or
# optim().  See those functions for interpretation.  The parameter
# estimates are first, the "value" contains the thing you 
# were optimizing/maximizing, in this case the log-likelihood (LnL).
#
# To extract the parameter values automatically, use e.g.:
# 
res2 = extract_params_from_BioGeoBEARS_results_object(results_object=resDEC, returnwhat="table", addl_params=c("j"), paramsstr_digits=4)
res2

# To extract just the LnL:
LnL_2 = get_LnL_from_BioGeoBEARS_results_object(resDEC)
LnL_2

# 4. The input model & run settings:

names(resDEC$inputs)

# resDEC$inputs is just the saved BioGeoBEARS_run_object that was input
# into bears_optim_run.  It is handy to save this in the result object,
# in order to automatic later processing, replication of the analysis, 
# etc.

# 5. The output model object:
resDEC$output

# This has the ML parameter estimates, as well as the initial values, limits, 
# and other parts of the BioGeoBEARS_model_object.
# 
# To access a particular value in the table:
resDEC$output@params_table
resDEC$output@params_table["d", "est"]

BioGeoBEARS legends

(link to this section)

I have added a script on how to build your own legend after running the example script. This could then be manually added to a graphic e.g. in Powerpoint, Photoshop or some other graphics editor. Note that it will work best for small numbers of areas and thus small numbers of states/geographic ranges. If you have e.g. 9 areas, and thus 2^9=512 possible geographic ranges, finding a good color scheme and legend format for those 512 possible states will be difficult-to-impossible without simplification of some sort.

This legend script is based on this post to the BioGeoBEARS google group.

It is difficult to make legends work automatically (e.g. via plotlegends=TRUE), because often there are so many states/ranges, and phylogenies can have many different sizes and shapes.

I originally wanted to add the legend to people's tree plots, but that is very difficult to make work. So I think it's easier for people to just build the legend separately.

Here is a script that does legends / keys for the default Hawaii dataset, or you can paste in the latter part of the script to your own analysis script.

#######################################################
# Example key table, and legend
#######################################################

#######################################################
# Make a table that is the abbreviations key
#######################################################
# Here, the file with the "key" is a tab-delimited text file -- 
# change to your file location if you don't want Hawaii
keyfn = "http://phylo.wdfiles.com/local--files/biogeobears/Hawaii_abbreviations_table.txt"

keydf = read.table(keyfn, header=TRUE, sep="\t", stringsAsFactors=FALSE, fill=TRUE)
keydf

keydf2 = keydf[,c("ab1", "desc")]
names(keydf2) = c("Abbr", "Description")
keydf2

library(gridExtra)    # for grid.table() function
grid.table(keydf2, rows=NULL)

# Defaults, if you want to use the Hawaii names/abbreviations from above
max_range_size = 4
include_null_range = TRUE
areanames = keydf$ab1
areanames

# You could paste the below directly into your script, if it is
# set up like in the PhyloWiki example script

#######################################################
# Plot legend for ALL states/ranges (there may be a 
# ton, and getting them all to display is hard)
#######################################################
pdffn = "colors_legend_all_v1.pdf"
pdf(pdffn, width=6, height=6)

areanames = names(tipranges@df)
areanames

include_null_range = TRUE

states_list_0based_index = rcpp_areas_list_to_states_list(areas=areanames, maxareas=max_range_size, include_null_range=include_null_range)

statenames = areas_list_to_states_list_new(areas=areanames, maxareas=max_range_size, include_null_range=include_null_range, split_ABC=FALSE)
statenames

relprobs_matrix = resDEC$ML_marginal_prob_each_state_at_branch_top_AT_node
MLprobs = get_ML_probs(relprobs_matrix)
MLstates = get_ML_states_from_relprobs(relprobs_matrix, statenames, returnwhat="states", if_ties="takefirst")

colors_matrix = get_colors_for_numareas(length(areanames))
colors_list_for_states = mix_colors_for_states(colors_matrix, states_list_0based_index, plot_null_range=include_null_range)
colors_list_for_states

possible_ranges_list_txt = areas_list_to_states_list_new(areas=areanames,  maxareas=max_range_size, split_ABC=FALSE, include_null_range=include_null_range)
cols_byNode = rangestxt_to_colors(possible_ranges_list_txt, colors_list_for_states, MLstates)

legend_ncol=NULL
legend_cex=1.5
colors_legend(possible_ranges_list_txt, colors_list_for_states, legend_ncol=legend_ncol, legend_cex=legend_cex)

dev.off()
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)

#######################################################
# Plot legend for SOME states/ranges 
# (especially since the widespread ranges are just
#  combinations of the primary colors used for 
#  single-area ranges)
#######################################################

pdffn = "colors_legend_some_v1.pdf"
pdf(pdffn, width=6, height=6)

# Subset to just some ranges (since there are sooo many combinations)
states_to_put_in_legend = c(1,2,3,4,5)
colors_list_for_states_subset = colors_list_for_states[states_to_put_in_legend]
possible_ranges_list_txt_subset = possible_ranges_list_txt[states_to_put_in_legend]

legend_ncol=NULL
legend_cex=1.5
colors_legend(possible_ranges_list_txt_subset, colors_list_for_states_subset, legend_ncol=legend_ncol, legend_cex=legend_cex)

dev.off()
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)

How (and whether) to collapse tips to prune a tree

(link to this section)

NOTE: See OTUs of the phylogeny should be species/populations, not specimens for background.

#######################################################
# How (and whether) to collapse tips
#######################################################
# People with trees that mix between-species and within-species variation 
# often need to reduce their tree to have just a tree of just species 
# or just monophyletic populations.
#
# This is because BioGeoBEARS (and Lagrange etc.) assume that 
# your phylogeny is a phylogeny of species/monophyletic populations, 
# rather than a tree of specimen samples.
#
# Options include:
# 
# 1. Re-do the analysis with just one OTU per species/monophyletic population 
#    (OTU = "operational taxonomic unit" = tip in a tree)
# 
# 2. Do a full gene tree/species tree analysis, e.g. with starBEAST
# 
# 3. Do a somewhat crude pruning in R, after the fact.
#
# Please note that, officially, it is a somewhat bad idea to estimated a dated 
# tree in BEAST with a mixture of within-species and between species OTUs (this 
# is the precursor to #3).  
#
# This is because the standard BEAST analysis will assume that the tree process is 
# EITHER phylogenetic (when you use a pure-birth (Yule) *OR* birth-death tree prior) 
# or population genetic (when you use a coalescent prior).  When your data mixes 
# these, there is the potential that some part of the tree dating is getting 
# screwed up by having an incorrect prior.
# 
# In my experience, it doesn't seem to matter much, at least for cases where the 
# data are a decent mix of within- and between-population data, there are a number
# of calibration points, and a phylogenetic tree prior is being used. 
# 
# I have seen bad dating problems, though, if there are few calibrations, and the data 
# are mostly within one coalescing population, and a birth-death prior is used.  This 
# pushed the divergence times way too deep and spread out the nodes way too much, 
# compared to the quite clocklike pattern (all the tips about the same height) and 
# coalescent pattern (a large clade of sequences, with very recent divergence times)
# in the undated tree.
# 
# Anyway, reviewers may complain if you use strategy #3. It's up to you to make a 
# good decision about these strategies and decide how to defend it in your paper.
# 
# All that said, here's the code for pruning trees.
###############################################################################

library(ape)    # for read.tree, drop.tip, etc.
library(gdata)     # for trim

library(BioGeoBEARS)    # for prune_specimens_to_species etc.

# REQUIRED patches for this to work
source("http://phylo.wdfiles.com/local--files/biogeobears/BioGeoBEARS_basics_v1.R")
source("http://phylo.wdfiles.com/local--files/biogeobears/BioGeoBEARS_readwrite_v1.R")
source("http://phylo.wdfiles.com/local--files/biogeobears/BioGeoBEARS_classes_v1.R")

# Here is a newick string (you could also load a tree from a newick file with read.tree)
newick_string = "(((sp1:0.03610559932,sp2:0.03610559932)1.0000:0.1776629349,((((sp3:0.008442378445,sp4:0.008442378445)1.0000:0.09122692306,sp5:0.09966930151)0.8380:0.05350909945,sp6:0.153178401)0.7060:0.04118371017,((sp20:0.1646354697,(((((sp21:0.05541959218,sp22:0.05541959218)0.7220:0.02482956203,sp23:0.08024915421)0.3250:0.01090832874,(sp24:0.07252867627,sp25:0.07252867627)0.5000:0.01862880668)0.3060:0.01016242573,sp26:0.1013199087)0.8940:0.05355067998,(sp27:0.1465197179,(sp28:0.1343470201,(sp29:0.1300709381,(((sp30:0.02406821269,sp31:0.02406821269)1.0000:0.03706382207,sp32:0.06113203476)1.0000:0.06390204876,((((sp33:0.02740065156,sp34:0.02740065156)0.6250:0.007508656628,sp35:0.03490930819)0.8450:0.02363732707,sp36:0.05854663526)1.0000:0.06308298802,(sp37:0.09852184783,(sp38:0.01910350891,(sp39:0.007259454956,sp40:0.007259454956)0.8450:0.01184405395)1.0000:0.07941833892)0.8610:0.02310777545)0.3450:0.003404460232)0.1360:0.005036854609)0.1050:0.004276082013)0.2300:0.0121726978)0.1700:0.008350870719)0.4160:0.009764880993)0.7460:0.02738230786,(sp19:0.1322081512,(sp18:0.1108740943,(sp17:0.09751406895,(sp16:0.07347283163,(sp15:0.05565075583,(sp14:0.04956539576,(sp13:0.04331045159,(sp12:0.02278042935,(sp11:0.01471299318,((sp9:0.006538713623,sp10:0.006538713623)0.7890:0.007881519684,(sp7:0.006528757196,sp8:0.006528757196)0.8350:0.007891476111)0.1960:0.0002927598732)0.6500:0.008067436169)1.0000:0.02053002224)0.4060:0.00625494417)0.2690:0.006085360071)0.4080:0.0178220758)0.9690:0.02404123732)0.8090:0.0133600254)0.8600:0.02133405685)0.9990:0.05980962632)0.2810:0.002344333608)0.5880:0.01940642309):0.7862314658,(sp41:0.05865850646,(sp42:0.01278826063,sp43:0.01278826063)1.0000:0.04587024583):0.9413414935);"

tr = read.tree(file="", text=newick_string)
tr

# Look at the tip labels
tr$tip.label

# Make a list of the tip labels
# (you could also load this from a text or Excel file)
old_tipnames = c("sp1",
"sp2",
"sp3",
"sp4",
"sp5",
"sp6",
"sp7",
"sp8",
"sp9",
"sp10",
"sp11",
"sp12",
"sp13",
"sp14",
"sp15",
"sp16",
"sp17",
"sp18",
"sp19",
"sp20",
"sp21",
"sp22",
"sp23",
"sp24",
"sp25",
"sp26",
"sp27",
"sp28",
"sp29",
"sp30",
"sp31",
"sp32",
"sp33",
"sp34",
"sp35",
"sp36",
"sp37",
"sp38",
"sp39",
"sp40",
"sp41",
"sp42",
"sp43")

# Make a list of the NEW tip labels. This must be in 
# the SAME ORDER as the tips in "old_tipnames"
# (it doesn't matter if the tips are in the same 
#  order as the tree)
#
#

new_tipnames = c("sp1",
"sp2",
"sp3",
"sp4",
"sp5",
"sp6",
"sp7",
"spA",
"spA",
"spA",
"spA",
"sp12",
"sp13",
"sp14",
"sp15",
"sp16",
"sp17",
"sp18",
"sp19",
"sp20",
"sp21",
"sp22",
"sp23",
"sp24",
"sp25",
"sp26",
"sp27",
"sp28",
"sp29",
"sp30",
"sp31",
"sp32",
"sp33",
"sp34",
"sp35",
"sp36",
"sp37",
"sp38",
"spB",
"spB",
"sp41",
"sp42",
"sp43")

# Make a data.frame
OTUs = old_tipnames
species = new_tipnames
tmpmat = cbind(OTUs, species)
df = as.data.frame(tmpmat, stringsAsFactors=FALSE)
names(df) = c("OTUs", "species")

# Look at the beginning of the table
head(df)

library(gdata) # for trim

# We'll make a PDF so you can see all the steps
pdffn = "pruning_PDF.pdf"
pdf(pdffn, height=6, width=6)

result = prune_specimens_to_species(original_tr=tr, xls=df, group_name="default", titletxt="", areas_abbr=NULL, plot_intermediate=TRUE)

# stop PDF and then open it
dev.off()
cmdstr = paste0("open ", pdffn)
system(cmdstr)

# Here's the new tree
pruned_tree = result$tr

#######################################################
# If you need to merge geography, you can do that also
#######################################################

# These also have to be in the same order
# (this is all easier to do in Excel with 
#  gdata::read.xls -- but this is just 
#  showing the concept)
region = c("A",
"A",
"A",
"A",
"A",
"A",
"A|B",
"C|D",
"E",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"A",
"F",
"C",
"A",
"A",
"A")

# Make a data.frame
OTUs = old_tipnames
species = new_tipnames
tmpmat = cbind(OTUs, species, region)
df = as.data.frame(tmpmat, stringsAsFactors=FALSE)
names(df) = c("OTUs", "species", "region")

# Look at the beginning of the table:
head(df)

# We'll make a PDF so you can see all the steps
pdffn = "pruning_PDF2.pdf"
pdf(pdffn, height=6, width=6)

result2 = prune_specimens_to_species(original_tr=tr, xls=df, group_name="default", titletxt="", areas_abbr=NULL, plot_intermediate=TRUE)

# stop PDF and then open it
dev.off()
cmdstr = paste0("open ", pdffn)
system(cmdstr)

# Here's the new tree
pruned_tree = result2$tr

# We can write it to a file
write.tree(pruned_tree, file="pruned_tree.newick")

# Look at the Newick file
moref("pruned_tree.newick")

# This time we have a geography object also
tipranges = result2$tipranges

# Write the tipranges to a text file
lgdata_fn = "pruned_geog.data"
save_tipranges_to_LagrangePHYLIP(tipranges=tipranges, lgdata_fn=lgdata_fn)

# Look at the text file
moref(lgdata_fn)
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License