-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplot_tree.rmd
More file actions
371 lines (315 loc) · 11.3 KB
/
plot_tree.rmd
File metadata and controls
371 lines (315 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
---
title: "Plot a phylogenetic tree"
output: html_document
---
This document describes the process of plotting a phylogenetic tree
Initially, I was having trouble reading the complex NEXUS format from IQ-TREE (for concordance factors)
The function `read.beast()` from package `treeio` associated with package `ggtree` fails
To get around this, I have written a short python script `concord_to_newick.py`
Pass the complicated concordance Nexus tree to that script, then use the output for plotting with this script
The input tree should be in Newick format and can include node labels
Files are expected to be in the current working directory (set with `setwd("")`)
The outgroup file has a single tip label per line
The clade file has a search term per line (or multiple, tab separated) unique for taxa in a clade for collapsing
Load libraries and read input
```{r}
suppressMessages(library(ape))
suppressMessages(library(phytools))
treefile <- "phylo.treefile"
concord_treefile <- "concord_scf.tre"
outgroup_file <- "outgroup.txt"
clade_file <- "clades.txt"
```
Read in the clades file if present
```{r}
if (file.exists(clade_file)) {
clades_present <- TRUE
clade_lines <- vector(mode = "list")
index <- 1
con <- file(clade_file, "r")
while (TRUE) {
line <- readLines(con, n = 1)
if (length(line) == 0) {
break
}
clade_lines[index] <- line
index <- index + 1
}
close(con)
} else {
clades_present <- FALSE
}
```
Read in the tree file
```{r}
tree <- read.tree(treefile)
```
If there is an outgroup specified, root the tree on the outgroup
```{r}
if (file.exists(outgroup_file)) {
outgroups <- read.table(outgroup_file)[, 1]
these_outgroups <- outgroups[outgroups %in% tree$tip.label]
rootnode <- getMRCA(tree, as.character(these_outgroups))
position <- 0.5 * tree$edge.length[which(tree$edge[, 2] == rootnode)]
rooted_tree <- reroot(tree, rootnode, position, edgelabel = TRUE)
tree <- rooted_tree
}
```
If using site concordance factors, capture them as a dataframe
(This works if captured with my `concord_to_newick.py` script: "sCF/sDF1/sDF2")
```{r}
if (file.exists(concord_treefile)) {
concord_tree <- read.tree(concord_treefile)
# if an outroup is present, root this tree too
if (file.exists(outgroup_file)) {
outgroups <- read.table(outgroup_file)[, 1]
these_outgroups <- outgroups[outgroups %in% concord_tree$tip.label]
rootnode <- getMRCA(concord_tree, as.character(these_outgroups))
position <- 0.5 * concord_tree$edge.length[which(concord_tree$edge[, 2] == rootnode)]
rooted_tree <- reroot(concord_tree, rootnode, position, edgelabel = TRUE)
concord_tree <- rooted_tree
}
concord_df <- data.frame(matrix(ncol = 3, nrow = concord_tree$Nnode))
index <- 1
for (label in concord_tree$node.label) {
if (any(label == "", label == "Root")) {
values <- c("", "", "")
} else {
values <- strsplit(label, split = "\\/")[[1]]
}
concord_df[index, 1:3] <- values
index <- index + 1
}
concord_df <- as.data.frame(lapply(concord_df, as.numeric))
}
```
**Tree Plotting**
1) PDF if wanting a simple tree
```{r}
pdf("tree.pdf", width = 16, height = 28, family = "ArialMT")
```
Option 1: plot the tree as a cladogram
```{r}
plot.phylo(ladderize(tree, right = FALSE),
no.margin = TRUE,
use.edge.length = FALSE,
node.depth = 2,
font = 1,
edge.width = 3)
```
Option 2: plot the tree with edge lengths
```{r}
plot.phylo(ladderize(tree, right = FALSE),
no.margin = TRUE,
font = 1,
edge.width = 3)
add.scale.bar(x = mean(par("usr")[1:2]), y = par("usr")[3] + 1, font = 1, lwd = 3)
```
If desired, plot node labels (e.g. bootstrap support) on edges
```{r}
drawSupportOnEdges(tree$node.label, adj = c(0.5, -0.5), frame = "none")
```
Another option is to plot node labels on nodes
```{r}
nodelabels(tree$node.label, adj = c(-0.05, 0.5), frame = "none")
```
If desired and present, plot site concordance factors as pie charts on nodes
```{r}
# to catch if there are any all 0
concord_df[which(concord_df[, 1] == 0 & concord_df[, 2] == 0 & concord_df[, 3] == 0), ] <- c(NA, NA, NA)
nodelabels(pie = concord_df,
piecol = c("white", "grey", "black"),
cex = 0.5)
```
Stop creating the PDF
```{r}
invisible(dev.off())
```
2) PDF of zoom into the ingroup
```{r}
pdf("tree_zoom.pdf", width = 11, height = 22, family = "ArialMT")
```
Create the zoom (assuming the labels are clear/unique for the outgroup)
```{r}
zoom(ladderize(tree, right = FALSE),
grep(paste(outgroups, collapse = "|"), tree$tip.label, invert = TRUE),
subtree = FALSE,
col = "black",
no.margin = TRUE,
font = 1,
edge.width = 3)
add.scale.bar(x = mean(par("usr")[1:2]), y = par("usr")[3] + 1, font = 1, lwd = 3)
```
For zooming, it is not clear how to adjust the node labels to display support
A new plot is made that changes the node numbers to sequential from the order going in
This method appears to work
```{r}
# first, determine the edge labels of the tree that is being zoomed into
old_edges <- ladderize(tree, right = FALSE)$edge
# next, find the node number of the MRCA for the ingroup
mrca_node <- getMRCA(tree, grep(paste(outgroups, collapse = "|"), tree$tip.label, invert = TRUE))
# the first time that node shows up in the edges matrix will be the start point for the zoomed tree
index <- grep(mrca_node, old_edges[, 2])
partner_edges <- old_edges[, 2][(index + 1): length(old_edges[, 2])]
# now take those numbers (numbers < Ntip(tree) are tips) and match node labels
# this should be the order of node labels in the zoomed tree
edge_nums <- partner_edges[partner_edges > Ntip(tree)]
new_node_labels <- tree$node.label[edge_nums - Ntip(tree)]
# plot support on edges
drawSupportOnEdges(new_node_labels, adj = c(0.5, -0.5), frame = "none")
```
Stop creating the PDF
```{r}
invisible(dev.off())
```
3) Collapsing clades
Start an SVG
```{r}
svg("tree.svg", width = 11, height = 22)
```
Perhaps modify N. Cusimano's function (https://www.en.sysbot.bio.lmu.de/people/employees/cusimano/use_r/mapping.R)?
See also: https://stackoverflow.com/a/34405198
First, determine the formatting for the plot and plot it (set up coordinates)
Define a function to display the tree that will have collapsed clades
Option 1: Cladogram (no branch lengths)
```{r}
collapse_tree <- function(phylo, ...) {
plot.phylo(ladderize(phylo, right = FALSE),
no.margin = TRUE,
use.edge.length = FALSE,
node.depth = 2,
font = 1,
...)
}
```
Option 2: Phylogram (with meaningful branch lengths)
```{r}
collapse_tree <- function(phylo, ...) {
plot.phylo(ladderize(phylo, right = FALSE),
no.margin = TRUE,
font = 1,
...)
}
```
Option 3: Phylogram, but reducing the impact of long edges (adjust num_edges to fit your tree's long branches)
```{r}
num_edges <- 3
collapse_tree <- function(phylo, ...) {
plotBreakLongEdges(ladderize(phylo, right = FALSE),
no.margin = TRUE,
font = 1,
n = num_edges,
...)
}
```
Now run the plotting and collapsing of clades
```{r}
# check if there are clades present
if (! clades_present) {
stop(print("There needs to be a clades file specified to collapse clades\n"), call. = FALSE)
}
# plot the tree to get a coordinate system
collapse_tree(tree)
# determine triangle coordinates and edges and tips to drop for clades
tri_coords <- vector(mode = "list", length = length(clade_lines))
tips_drop <- vector(mode = "list", length = length(clade_lines))
edges_drop <- vector(mode = "list", length = length(clade_lines))
labels <- vector(mode = "list", length = length(clade_lines))
root_node <- length(tree$tip.label) + 1
for (index in seq_len(length(clade_lines))) {
# extract the search terms and determine the taxa and label
taxa <- vector(mode = "character")
terms <- strsplit(clade_lines[[index]], split = "\t")[[1]]
for (term in terms) {
hits <- tree$tip.label[grep(term, tree$tip.label)]
taxa <- c(taxa, hits)
}
labels[index] <- paste(terms, collapse = "")
# find most recent common ancestor
mrca_node <- getMRCA(tree, taxa) # node number
# determine the tips that comprise the clade and their plotting numbers
leaves <- extract.clade(tree, mrca_node)$tip.label
tips_drop[index] <- list(leaves)
leaf_nodes <- match(leaves, tree$tip.label)
# determine the edges that need to be dropped
edges <- which.edge(tree, leaves)
edges_drop[index] <- list(edges)
# determine the clade internal node indices and remove labels
# but NOT the node at the base of the clade
mrca_index <- mrca_node - Ntip(tree)
edge_index <- unique(tree$edge[edges, 1]) - Ntip(tree)
edge_index <- setdiff(edge_index, mrca_index)
tree$node.label[edge_index] <- ""
# determine the coordinate for the mrca node in the plot
lastplot <- get("last_plot.phylo", envir = .PlotPhyloEnv)
xcoord1 <- lastplot$xx[mrca_node]
ycoord1 <- lastplot$yy[mrca_node]
# determine the coordinates for the base of the triangle
# x is the max of the clade tips
xcoord2 <- max(lastplot$xx[leaf_nodes])
# y coords are the min and max of the clade (traingle may not be even)
ycoord2 <- min(lastplot$yy[leaf_nodes])
ycoord3 <- max(lastplot$yy[leaf_nodes])
# y coords could be the difference between min and max plus/minus from the mrca y
#spread IS max(lastplot$yy[leaf_nodes]) - min(lastplot$yy[leaf_nodes])
#ycoord2 IS ycoord1 - (spread / 2)
#ycoord3 IS ycoord1 + (spread / 2)
# copy the coords
xcoords <- c(xcoord1, xcoord2, xcoord2)
ycoords <- c(ycoord1, ycoord2, ycoord3)
tri_coords[index] <- list(list(xcoords, ycoords))
}
# now plot the tree with the edges and tips of interest removed
mytree <- tree
# set tip labels to be dropped to NA
all_drops <- unlist(tips_drop)
mytree$tip.label[match(all_drops, mytree$tip.label)] <- NA
# determine the edges in the original tree and their order in the ladderized one
# designate the edges as unique combos of column values in the tree$edge table
all_edges <- unlist(edges_drop)
edge_combos <- tree$edge[all_edges, ] # the edge order is specific to the unaltered tree
edges_char <- paste0(as.character(edge_combos[, 1]), "_", as.character(edge_combos[, 2]))
new_edges_char <- paste0(as.character(lastplot$edge[, 1]), "_",
as.character(lastplot$edge[, 2]))
# set the specific edges to have width -1
edge_widths <- rep(3, nrow(mytree$edge))
edge_widths[new_edges_char %in% edges_char] <- -1
collapse_tree(mytree, edge.width = edge_widths)
# add the triangles
for (coordset in tri_coords) {
polygon(coordset[[1]], coordset[[2]], lwd = 3)
}
# add clade labels
for (index in seq_len(length(labels))) {
label <- labels[[index]]
xcoord <- tri_coords[[index]][[1]][3]
ycoord <- mean(tri_coords[[index]][[2]][2: 3])
# alternatively, if using the even spread,
#ycoord IS tri_coords[[index]][[2]][1]
text(xcoord, ycoord, labels = label, pos = 4)
}
```
If the collapsed clades tree is a phylogram, add a scale bar
```{r}
add.scale.bar(x = mean(par("usr")[1:2]), y = par("usr")[3] + 1, font = 1, lwd = 3)
```
If desired, plot node labels (e.g. bootstrap support) on edges
```{r}
drawSupportOnEdges(tree$node.label, adj = c(0.5, -0.5), frame = "none")
```
Another option is to plot node labels on nodes
```{r}
nodelabels(tree$node.label, adj = c(-0.05, 0.5), frame = "none")
```
If desired and present, plot site concordance factors as pie charts on nodes
```{r}
# to catch if there are any all 0
concord_df[which(concord_df[, 1] == 0 & concord_df[, 2] == 0 & concord_df[, 3] == 0), ] <- c(NA, NA, NA)
nodelabels(pie = concord_df,
piecol = c("white", "grey", "black"),
cex = 1)
```
Stop creating the SVG
```{r}
invisible(dev.off())
```