Introduction To R
I am uploading the code for lab here, so that I/we can edit it, add notes, etc., as necessary. Feel free to login and add clarifications etc.!
— Nick
I just uploaded the code, with a few bug fixes, and with answers — Nick, 2/8/11.
##############################################################
# =============================================
# R and phylogenetics, starting from scratch
# =============================================
# by Nick Matzke (and whoever else adds to this PhyloWiki page)
# Copyright 2011-infinity
# matzkeATberkeley.edu
# January 2011
#
# Please link/cite if you use this, email me if you have
# thoughts/improvements/corrections.
#
##############################################################
#
# Free to use/redistribute under:
# Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the above license, linked here:
#
# http://creativecommons.org/licenses/by-nc-sa/3.0/
#
# Summary:
#
# You are free:
#
# * to Share — to copy, distribute and transmit the work
# * to Remix — to adapt the work
#
# Under the following conditions:
#
# * Attribution — You must attribute the work in the manner
# specified by the author or licensor (but not in any way that
# suggests that they endorse you or your use of the work).
# * Noncommercial — You may not use this work for commercial purposes.
#
# * Share Alike — If you alter, transform, or build upon this work,
# you may distribute the resulting work only under the same or
# similar license to this one.
#
# http://creativecommons.org/licenses/by-nc-sa/3.0/
#
###################################################################
#
# Assignment for Thursday, Feb. 3
# 1. Install R on your own computer
#
# 2. Take this test file and copy/paste in each of the commands
# and see what they do. In my experience, these commands
# cover about 80% of the commands you actually need do
# use R.
#
# 3. Each time you see "#desc:", type YOUR quick description of
# what the function is doing (and what options are being used in the
# command, if needed). Hints:
#
# * "?functionname" brings up the help page for the command
# * in the R GUI, "functionname(" brings up the proper syntax at
# the bottom of the R GUI window
#
# 4. Turn in by PASTING your text file into email to me.
# * Pretty please: start the SUBJECT of the email with: "IB200b_Lab3_Your_Name"
# # Due next lab.
#
# Purpose: This may seem a little tedious, but the idea is that
# (a) you will remember the basic commands better if you've been
# forced to write them down
#
# (b) to get you in the habit of saving ALL your working R commands
# in a text file/script for future reference and future use
#
# (c) producing well-commented code, so you can figure out what you
# did when you return to a project six months later
#
###################################################################
# Stuff that is handy at the beginning of a script
######################################################
# R, close all devices/windows/ quartz windows
graphics.off()
# Turn off all BS stringsAsFactors silliness
# R tends to read
options(stringsAsFactors = FALSE)
######################################################
# Working directories:
# One of the first things you want to do, usually, is
# decide on your working directory
# On my Mac, this is the directory
wd = "/Users/nick/Desktop/_ib200a/ib200b_sp2011/lab03"
# On a PC, you might have to specify paths like this:
#wd = "c:\\Users\\nick\\Desktop\\_ib200a\\ib200b_sp2011\\lab03"
#desc: set working directory
setwd(wd)
#desc: get working directory
getwd()
#desc: list the files in the directory
list.files()
# Sourcing scripts
# Local sourcing:
# source a script (windows machines require \\ instead of /)
# sourcedir = '/_njm/'
#source3 = '_genericR_v1.R'
#source(paste(sourcedir, source3, sep=""))
# You need the APE package to source genericR_v1.R scripts
install.packages("ape")
library(ape)
# Remote sourcing:
source("http://ib.berkeley.edu/courses/ib200b/scripts/_genericR_v1.R")
#desc: concatentate to a list with c()
student_names = c("Nick", "Tom", "Bob")
# describe what the function "c" does:
#desc:
grade1 = c(37, 100, 60)
grade2 = c(43, 80, 70)
grade3 = c(100, 90, 100)
grade1
grade2
grade3
print(grade3)
#desc: column bind (cbind)
temp_table = cbind(student_names, grade1, grade2, grade3)
#desc: convert to data frame
grade_data = as.data.frame(temp_table)
# Try the same command this way. What happens differently?
#desc: convert to data frame, print to screen
(grade_data = as.data.frame(temp_table))
col_headers = c("names", "test1", "test2", "test3")
#desc: add column heading
names(grade_data) = col_headers
print(grade_data)
# change the column names back
old_names = c("student_names", "grade1", "grade2", "grade3")
names(grade_data) = old_names
# R can be very annoying in certain situations, e.g. treating numbers as character data
# What does as.numeric do?
#desc:
grade_data$grade1 = as.numeric(grade_data$grade1)
grade_data$grade2 = as.numeric(grade_data$grade2)
grade_data$grade3 = as.numeric(grade_data$grade3)
(grade_data)
# If you close R, reopen it, and re-run all of the commands above,
# except leave out "options(stringsAsFactors = FALSE)",
# you will see the bad thing which results.
# I got so annoyed at R's tendency to convert data intended as
# numeric (numbers) or text (character) into factors (categorical)
# that I wrote a function specificially to check the "class"
# of each column:
# summary of an object's class, attributes, dimensions, length,
# and class of each column, if it has columns
summ(grade_data)
# Print just the classes of the columns in a data frame
cls.df(grade_data)
# I also wrote functions to convert all columns classed as factors
# into numbers or characters:
dfnums_to_numeric(grade_data)
# calculate means, etc.
#desc: mean of one column
mean(grade_data$grade1)
#desc: apply the mean function over the rows, for just the numbers columns (2, 3, and 4)
apply(grade_data[ , 2:4], 1, mean)
# Other summary statistics
mean(grade_data)
# What caused the warning message in mean(grade_data)?
#desc:
sum(grade_data$grade1)
median(grade_data$grade1)
#desc: standard deviation
apply(grade_data[ , 2:4], 1, sd)
# store st. dev and multiply by 2
mean_values = apply(grade_data[ , 2:4], 1, mean)
sd_values = apply(grade_data[ , 2:4], 1, sd)
2 * sd_values
# print to screen even within a function:
print(sd_values)
#desc: row bind (rbind)
grade_data2 = rbind(grade_data, c("means", mean_values), c("stds", sd_values))
#############################################
# install a package (only have to do once)
#############################################
# Type this:
odd(13)
# What happened?
#desc:
# Now do this:
install.packages("gtools")
# gtools contains functions for odd/even (and many other things)
# after a package is installed, you have to library() it to use its
# functions during an R session
library(gtools)
# Now type this:
odd(13)
# For loops
# Here, we are using for loops, if statements, and the gtools function "odd"
# What does the code in this for loop do?
#desc:
for (i in 1:10)
{
if (odd(i) == TRUE)
{
print(paste(i, "is odd!", sep=" "))
}
else
{
print("Blah!")
}
}
#desc:
paste("This", "is", "fun", sep=" ")
# print can be annoying, use cat
for (i in 1:10)
{
if (odd(i) == TRUE)
{
cat(paste(i, "is odd!", "\n", sep=" "))
}
else
{
cat("Blah!\n" )
}
}
# How to make your own function
# (These can be sources from a source script, which is
# a .R file either on your computer
get_square <- function(x)
{
output = x^2
return(output)
}
x = 4
#desc:
(newval = get_square(x))
#desc: Write to tab-delimited text file
fn = "grade_data.txt"
write.table(grade_data, file=fn, quote=FALSE, sep="\t", row.names=TRUE, col.names=TRUE)
# read it back in
new_data = read.table(fn, header=TRUE, sep="\t", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
# plots and stats
#desc:
plot(new_data$grade1, new_data$grade2)
#desc:
title("Hi Tom!")
#desc:
plot(new_data$grade1, new_data$grade2, xlab="scores for grade #1", ylab="scores for grade #2")
#desc:
lines(new_data$grade1, new_data$grade2)
#desc:
pairs(new_data[, 2:4])
#desc:
cor(new_data[, 2:4])
# A function I stole from somewhere and put in genericR_v1:
pairs_with_hist(new_data[, 2:4])
# CTRL-left or right to arrow through plots
# help on any function
?mean
?std
# once you are looking at the help page, go to the bottom & click index to see all the options for the package
# or just e.g.
?gtools
# search for text in help
# (marginally useful)
??histogram
# I like to google: ' "r-help" something something '
# ...since someone has always asked my question on the r-help listserv
###############################################
# Basic crash course in APE
# The R Package APE: Analysis of Phylogenetics
# and Evolution
#
# Paradis's book on APE is linked from the
# course website:
# http://ib.berkeley.edu/courses/ib200b/IB200B_SyllabusHandouts.shtml
# (for Feb. 3)
###############################################
# Install APE
install.packages("ape")
# (This should install some other needed packages also)
library(ape)
# This is what a Newick string looks like:
newick_str = "(((Humans, Chimps), Gorillas), Orangs);"
tr = read.tree(text=newick_str)
plot(tr)
# What is the data class of "tr"?
#desc:
class(tr)
# Is there any difference in the graphic produced by these two commands?
#desc:
plot(tr)
plot.phylo(tr)
# What is the difference in the result of these two help commands?
#desc:
?plot
?plot.phylo
# What are we adding to the tree and the plot of the tree, this time?
#desc:
newick_str = "(((Humans:6.0, Chimps:6.0):1.0, Gorillas:7.0):1.0, Orangs:8.0):1.0;"
tr = read.tree(text=newick_str)
plot(tr)
# What are we adding to the tree and the plot of the tree, this time?
#desc:
newick_str = "(((Humans:6.0, Chimps:6.0)LCA_humans_chimps:1.0, Gorillas:7.0)LCA_w_gorillas:1.0, Orangs:8.0)LCA_w_orangs:1.0;"
tr = read.tree(text=newick_str)
plot(tr, show.node.label=TRUE)
# More on Newick format, which, annoyingly, is sometimes inconsistent:
# http://en.wikipedia.org/wiki/Newick_format
# Have a look at how the tree is stored in R
tr
#desc:
tr$tip.label
#desc:
tr$edge
#desc:
tr$edge.length
#desc:
tr$node.label
# If you forget how to find these, you can use the "attributes" function
#desc:
attributes(tr)
# Or my "summ" function
summ(tr)
# However, I got really tired of typing all that stuff, and
# trying to remember which branch points to what node (and the nodes
# are only implicitly encoded in R anyway, not explicitly!), so I wrote
# the function "prt" to print the tree structure to screen all at once:
# Local sourcing:
#sourcedir = '/_njm/'
#source3 = '_R_tree_functions_v1.R'
#source(paste(sourcedir, source3, sep=""))
# Remote sourcing:
source("http://ib.berkeley.edu/courses/ib200b/scripts/_R_tree_functions_v1.R")
prt(tr)
# What information is given in each column of the output from "prt"?
#node:
#ord_ndname:
#node_lvl:
#node.type:
#parent_br:
#edge.length:
#ancestor:
#daughter_nds:
#node_ht:
#time_bp:
#label:
# Now plot the tree in different ways:
# (CTRL-right or CTRL-left to flip between the trees in the graphics window)
plot(tr, type="phylogram")
#desc:
plot(tr, type="phylogram", direction="rightwards")
plot(tr, type="phylogram", direction="leftwards")
plot(tr, type="phylogram", direction="upwards")
plot(tr, type="phylogram", direction="downwards")
#desc:
plot(tr, type="cladogram")
plot(tr, type="fan")
plot(tr, type="unrooted")
plot(tr, type="radial")
#desc:
plot(tr, type="unrooted", edge.width=5)
plot(tr, type="unrooted", edge.width=5, edge.color="blue")
plot(tr, type="unrooted", edge.width=5, edge.color="blue", lab4ut="horizontal")
plot(tr, type="unrooted", edge.width=5, edge.color="blue", lab4ut="axial")
# In R GUI, you can save any displayed tree to PDF, or do a screen capture etc.
# you can also save a tree to PDF as follows:
pdffn = "homstree.pdf"
pdf(file=pdffn)
plot(tr, type="unrooted", edge.width=5, edge.color="blue", lab4ut="axial")
dev.off()
# In Macs (and maybe PCs), this will open the PDF from R:
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)
# How to save the tree as text files
#desc:
newick_fn = "homstree.newick"
write.tree(tr, file=newick_fn)
# moref is another function I wrote, to work vaguely like the UNIX "more" command
#desc:
moref(newick_fn)
#desc:
nexus_fn = "homstree.nexus"
write.nexus(tr, file=nexus_fn)
moref(nexus_fn)
# To conclude the lab, I wanted to find, download, and display
# a "tree of life".
#
# To do this, I went to the TreeBase search page:
# http://www.treebase.org/treebase-web/search/studySearch.html
#
# ...and searched on studies with the title "tree of life"
#
# Annoyingly, the fairly famous tree from:
#
# Ciccarelli F.D. et al. (2006). "Toward automatic reconstruction of
# a highly resolved tree of life." Science, 311:1283-1287.
# http://www.sciencemag.org/content/311/5765/1283.abstract
#
# ...was not online, as far as I could tell. And a lot of these are the "turtle trees of life", etc.
# Lame. But this one was a tree covering the root of known
# cellular life.
#
# Caetano-anollés G. et al. (2002). "Evolved RNA secondary structure
# and the rooting of the universal tree of life." Journal of
# Molecular Evolution.
#
# Check S796 for this study, then click over to the "Trees" tab to get the
# tree...
#
# http://www.phylowidget.org/full/?tree=%27http://www.treebase.org/treebase-web/tree_for_phylowidget/TB2:Tr3931%27
#
# Or, download the tree from our website, here:
# http://ib.berkeley.edu/courses/ib200b/labs/Caetano-anolles_2002_JME_ToL.newick
# load the tree and play with it:
newick_fn = "Caetano-anolles_2002_JME_ToL.newick"
tree_of_life = read.tree(newick_fn)
plot(tree_of_life, type="cladogram")
plot(tree_of_life, type="phylogram")
plot(tree_of_life, type="unrooted", lab4ut="axial")
# aw, no branch lengths in TreeBase! Topology only! Lame!
# Students ignore this:
# Make the version of this code for handout (no answers)
source3 = '_genericR_v1.R'
source(paste(sourcedir, source3, sep=""))
newfn = replace_each_matching_line("_Nicks_intro_to_R_v3.R", "#desc:", "#desc:")
moref(newfn)