_biogeog_workshop_ML_search_v2.R
##############################################################
# INTRODUCTORY STUFF
#
# This script is by Nick Matzke (and whoever else adds to this PhyloWiki page)
# Copyright 2011-infinity
# matzkeATberkeley.edu
# January 2011
#
# Please link/cite if you use this, email me if you have 
#   thoughts/improvements/corrections.
#
##############################################################
#
# Free to use/redistribute under:
# Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0) 
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the above license, linked here:
# 
# http://creativecommons.org/licenses/by-nc-sa/3.0/
# 
# Summary:
#
# You are free:
#
#   * to Share — to copy, distribute and transmit the work
#   * to Remix — to adapt the work
#
# Under the following conditions:
#
#   * Attribution — You must attribute the work in the manner 
#     specified by the author or licensor (but not in any way that 
#     suggests that they endorse you or your use of the work).
#   * Noncommercial — You may not use this work for commercial purposes. 
#
#   * Share Alike — If you alter, transform, or build upon this work,
#     you may distribute the resulting work only under the same or
#     similar license to this one. 
#
# http://creativecommons.org/licenses/by-nc-sa/3.0/
# 
###################################################################

########################################################
# SCRIPT DESCRIPTION
# 
# A simple example of a maximum likelihood search for
# simulating and then jointly estimating the probability
# of dispersal between geographic regions, climate zones,
# and elevation zones.
#
# Script #1/2 for:
#
# Evolution of Life on Pacific Islands and Reefs: Past, present, and future
# WORKSHOP: Methodological Workshop on Biodiversity Dynamics
# THURSDAY, May 26, 2011: 9am - 1pm
# http://botany.si.edu/events/2011_pacific/program.htm
########################################################

########################################################
# SETUP HINTS
# 
# NOTE: before running this script,
#    you have to "source" my utility scripts,
#    which contain special functions I have written.
#    
#    The "source" command make the scripts accessible
#    by R.
# 
#    You can download the scripts and source them
#    locally if you like.  This means you have to 
#    put the correct directory into "sourcedir"
#    (I use /_njm/ on my computer, but you should 
#     make the directory wherever you downloaded 
#     the scripts to.).
#
#    Alternatively, you can source the versions I 
#    have put on the web -- but these will only 
#    work as long as they are online on this 
#    Berkeley course website, which might not be
#    up forever.
########################################################

# Local sourcing of Nick's utility scripts
#===================================================
sourcedir = '/_njm/'
source3 = '_genericR_v1.R'
source(paste(sourcedir, source3, sep=""))

# for e.g. calc_loglike
sourcedir = '/_njm/'
source3 = '_R_tree_functions_v1.R'
source(paste(sourcedir, source3, sep=""))

sourcedir = "/_njm/"
source8 = '_matrix_utils_v1.R'
source(paste(sourcedir, source8, sep=""))
#===================================================

# Remote sourcing of Nick's utility scripts
#===================================================
source3 = 'http://ib.berkeley.edu/courses/ib200b/labs/_code/_genericR_v1.R'
source(source3)

source3 = 'http://ib.berkeley.edu/courses/ib200b/labs/_code/_R_tree_functions_v1.R'
source(source3)

source3 = 'http://ib.berkeley.edu/courses/ib200b/labs/_code/_matrix_utils_v1.R'
source(source3)
#===================================================

# "Library" makes these R packages usable in your
# R session.

# If you haven't installed them before, run:
# install.packages("ape")
# install.packages("geiger")
library(ape)
library(geiger)

#library(phangorn)

# Simulate a random tree with 100 taxa
# (a coalescent tree isn't really appropriate
#  for a phylogeny of multiple species
#  but this is just an example)
tr = rcoal(100)

# Make a transition matrix for each variable of interest
# (Here, dispersal rate between geographic regions,
#  between climate zones, and between elevation zones)

# rate of dispersal
d = 0.2
dnum = 2
dmat = matrix(data=c(1, d, d, 1), nrow=dnum, ncol=dnum)
dmatnames = matrix(data=c("1", "d", "d", "1"), nrow=dnum, ncol=dnum)

# rate of elevation switching
e = 0.04
enum = 2
emat = matrix(data=c(1, e, e, 1), nrow=enum, ncol=enum)
ematnames = matrix(data=c("1", "e", "e", "1"), nrow=enum, ncol=enum)

# rate of dry/wet switching
c = 0.02
cnum = 2
cmat = matrix(data=c(1, c, c, 1), nrow=cnum, ncol=cnum)
cmatnames = matrix(data=c("1", "c", "c", "1"), nrow=cnum, ncol=cnum)

d_orig = d
e_orig = e
c_orig = c

dmat2 = matrix(c(1, d, d, d, 1, d, d, d, 1), nrow=3, ncol=3)
emat2 = emat
cmat2 = cmat

list_of_matrices = list(dmat, emat, cmat)
list_of_matrices_w_variable_names = list(dmatnames, ematnames, cmatnames)
tmpmat = combine_matrices_into_supermatrix(list_of_matrices)
tmpmat_names = combine_matrices_into_supermatrix_w_names(list_of_matrices_w_variable_names)

# Normalize the transition matrix so that
# rows sum to 0
(tmpmat2 = normat(tmpmat))

# lengthen edge lengths so that you 
# get about the the right amount of change
tr$edge.length = tr$edge.length *20

# Simulate a history of changes in
# range, climate, and elevation
# using sim.char
#
tmpmat3 = mat_to_Qmat_as_list(tmpmat2)
out = sim.char(tr, model.matrix=tmpmat3, nsims=1, model="discrete", root.state=1) 

(length(unique(out)))

uniq_states = unique(c(tmpmat2))
index_rates_tmpmat2 = tmpmat2
for (j in 1:length(uniq_states))
    {
    index_rates_tmpmat2[tmpmat2 == uniq_states[j]] = j-1
    }

# Do ancestral character estimation and calculate a log-likelihood
# using an equal-rates (ER) transition matrix
res = ace(out, tr, method="ML", type="discrete", CI=TRUE, model="ER")
#res = ace(out, tr, method="ML", type="discrete", CI=FALSE, model=index_rates_tmpmat2, ip=0.1)

(sum(log(res$lik.anc)))

(ll = calc_loglike(out, tr, tmpmat2))

# This is a pretty poor fit.  Can we do better?

###################################################
# A primitive ML (maximum likelihood) search
###################################################

# DO LOG LIKELIHOOD MANUALLY
# BY GOING THROUGH PARAMETERS STEP-BY-STEP
# AND CALCULATING THE LIKELIHOOD FOR EACH

# Set up the parameter values to try during the 
# gridded search
rates_array = array(data=1, dim=c(10,10,10))
dtry = c(0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1)
etry = c(0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1)
ctry = c(0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1)

for (i in 1:length(dtry))
    {
    rates_array[i, , ] = rates_array[i, , ] * dtry[i]
    }
for (i in 1:length(etry))
    {
    rates_array[, i, ] = rates_array[i, , ] * etry[i]
    }
for (i in 1:length(ctry))
    {
    rates_array[, , i] = rates_array[i, , ] * ctry[i]
    }

# do ll (log-likelihood) for each
ll_array = array(data=1, dim=c(10,10,10))
pos_array = array(data=NA, dim=c(10,10,10))

# Gridded search
matrix_dim = get_bigmatrix_dim(list_of_matrices)
for (i in 1:length(dtry))
    {
    for (j in 1:length(etry))
        {
        for (k in 1:length(ctry))
            {
            d = dtry[i]
            e = dtry[j]
            c = dtry[k]

            tmpmat_z1 = matrix(NA, nrow=matrix_dim, ncol=matrix_dim)
            for (z in 1:length(tmpmat_z1))
                {
                cmdstr = paste("tmpmat_z1[z] = ", tmpmat_names[z], sep="")
                eval(parse(text=cmdstr))
                }
            tmpmat_z1 = normat(tmpmat_z1)
            #print(tmpmat_z1)

            ll = calc_loglike(out, tr, tmpmat_z1)

            ll_array[i,j,k] = ll
            tmp = paste(i, j, k, ll, sep=" -- ")
            pos_array[i,j,k] = tmp
            print(tmp)
            }
        }
    }

# Display the distribution of log-likelihood values
# Note: the highest (least-negative) value
# represents the parameters that conferred
# the highest probability on the data
hist(ll_array)
min(ll_array)
(m = max(ll_array))
posm = ll_array == m

# Best values
pos_array[posm]

# Something like these values will come out the best
dtry[6]
etry[4]
ctry[1]

# Original parameter values under which the
# simulation was run
d_orig
e_orig
c_orig
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License