pheatmap_advanced
Case 1
The datasets were provided by data-to-viz
library(tidyverse)
library(pheatmap)
library(ggplot2)
library(viridis)
library(kableExtra)
### dataset 1
data <- read.table("https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/13_AdjacencyDirectedWeighted.csv", header=TRUE)
# show data
data %>% head(3) %>% select(1:3) %>% kable() %>%
kable_styling(bootstrap_options = "striped", full_width = F)
Africa |
East.Asia |
Europe |
|
---|---|---|---|
Africa |
3.142471 |
0.000000 |
2.107883 |
East Asia |
0.000000 |
1.630997 |
0.601265 |
Europe |
0.000000 |
0.000000 |
2.401476 |
### the following function were embeded in pheatmap source code
scale_rows = function(x){
m = apply(x, 1, mean, na.rm = T)
s = apply(x, 1, sd, na.rm = T)
return((x - m) / s)
}
scale_mat = function(mat, scale){
if(!(scale %in% c("none", "row", "column"))){
stop("scale argument shoud take values: 'none', 'row' or 'column'")
}
mat = switch(scale, none = mat, row = scale_rows(mat), column = t(scale_rows(t(mat))))
return(mat)
}
generate_breaks = function(x, n, center = F){
if(center){
m = max(abs(c(min(x, na.rm = T), max(x, na.rm = T))))
res = seq(-m, m, length.out = n + 1)
}
else{
res = seq(min(x, na.rm = T), max(x, na.rm = T), length.out = n + 1)
}
return(res)
}
data.plot <- scale_mat(mat = data,scale = "column")
breaks <- generate_breaks(data.plot,n = 256,center = F)
pheatmap::pheatmap(mat = data.plot,
cluster_cols = F,
cluster_rows = F,
scale = "column",border_color = "white",
color = viridis(n = 256, alpha = 1,
begin = 0, end = 1, option = "viridis"),
breaks = breaks)
case 2
the codes were adapted from slowkow Sort dendrogram is very important
set.seed(42)
random_string <- function(n) {
substr(paste(sample(letters), collapse = ""), 1, n)
}
mat <- matrix(rgamma(1000, shape = 1) * 5, ncol = 50)
colnames(mat) <- paste(
rep(1:3, each = ncol(mat) / 3),
replicate(ncol(mat), random_string(5)),
sep = ""
)
rownames(mat) <- replicate(nrow(mat), random_string(3))
mat %>% as.data.frame %>% head(3) %>% select(1:3) %>% kable() %>%
kable_styling(bootstrap_options = "striped", full_width = F)
1jrqxa |
1pskvw |
1ojvwz |
|
---|---|---|---|
abv |
9.6964789 |
9.172811 |
2.827695 |
nft |
0.9020955 |
15.575853 |
4.328376 |
xha |
2.6721643 |
3.127039 |
1.765077 |
split data into 3 groups, and increase the values in group1
making the heatmap
# install.packages("pheatmap", "RColorBrewer", "viridis")
library(pheatmap)
library(RColorBrewer)
library(viridis)
# Data frame with column annotations.
mat_col <- data.frame(group = col_groups)
rownames(mat_col) <- colnames(mat)
# List with colors for each annotation.
mat_colors <- list(group = brewer.pal(3, "Set1"))
names(mat_colors$group) <- unique(col_groups)
pheatmap(
mat = mat,
color = inferno(10),
border_color = NA,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_col = mat_col,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 14,
main = "Default Heatmap"
)
The default color breaks in pheatmap are uniformly distributed across the range of the data.
We can see that values in group 1 are larger than values in groups 2 and 3. However, we can’t distinguish different values within groups 2 and 3.
## ----uniform-color-breaks------------------------------------------------
mat_breaks <- seq(min(mat), max(mat), length.out = 10)
dat <- data.frame(values = as.numeric(mat))
## ----uniform-color-breaks-detail, fig.height=2, echo=FALSE---------------
dat_colors <- data.frame(
xmin = mat_breaks[1:(length(mat_breaks)-1)],
xmax = mat_breaks[2:length(mat_breaks)],
ymin = 0,
ymax = max(density(mat, bw = "SJ")$y),
fill = rev(inferno(length(mat_breaks) - 1)),
stringsAsFactors = FALSE
)
ggplot() +
geom_rect(
data = dat_colors,
mapping = aes(
xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill
)
) +
geom_density(
data = dat,
mapping = aes(values),
bw = "SJ", color = "cyan"
) +
scale_fill_manual(values = dat_colors$fill) +
cowplot::theme_cowplot()+
theme(legend.position = "none") +
labs(title = "Uniform breaks")
there are 6 data points greater than or equal to 100 are represented with 4 different colors.
dat2 <- as.data.frame(table(cut(
mat, mat_breaks
)))
dat2$fill <- inferno(nrow(dat2))
ggplot() +
geom_bar(
data = dat2,
mapping = aes(x = Var1, weight = Freq, fill = Var1),
color = "black", size = 0.1
) +
coord_flip() +
scale_fill_manual(values = dat2$fill) +
cowplot::theme_cowplot()+
theme(legend.position = "none") +
labs(y = "data points", x = "breaks",
title = "Number of data points per color")
If we reposition the breaks at the quantiles of the data, then each color will represent an equal proportion of the data:
quantile_breaks <- function(xs, n = 10) {
breaks <- quantile(xs, probs = seq(0, 1, length.out = n))
breaks[!duplicated(breaks)]
}
mat_breaks <- quantile_breaks(mat, n = 11)
lets see
dat_colors <- data.frame(
xmin = mat_breaks[1:(length(mat_breaks)-1)],
xmax = mat_breaks[2:length(mat_breaks)],
ymin = 0,
ymax = max(density(mat, bw = "SJ")$y),
fill = rev(inferno(length(mat_breaks) - 1)),
stringsAsFactors = FALSE
)
ggplot() +
geom_rect(
data = dat_colors,
mapping = aes(
xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill
)
) +
geom_density(
data = dat,
mapping = aes(values),
bw = "SJ", color = "cyan"
) +
scale_fill_manual(values = dat_colors$fill) +
theme(legend.position = "none") +
labs(title = "Quantile breaks")
dat2 <- as.data.frame(table(cut(
mat, mat_breaks
)))
dat2$fill <- inferno(nrow(dat2))
ggplot() +
geom_bar(
data = dat2,
mapping = aes(x = Var1, weight = Freq, fill = Var1),
color = "black", size = 0.1
) +
coord_flip() +
scale_fill_manual(values = dat2$fill) +
theme(legend.position = "none") +
labs(y = "data points", x = "breaks",
title = "Number of data points per color")
When we use quantile breaks in the heatmap, we can clearly see that group 1 values are much larger than values in groups 2 and 3, and we can also distinguish different values within groups 2 and 3:
pheatmap(
mat = mat,
color = inferno(length(mat_breaks) - 1),
breaks = mat_breaks,
border_color = NA,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_col = mat_col,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 14,
main = "Quantile Color Scale"
)
We can also transform data
pheatmap(
mat = log10(mat),
color = inferno(10),
border_color = NA,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_col = mat_col,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 14,
main = "Log10 Transformed Values"
)
sort dendrograms
library(dendsort)
mat_cluster_cols <- hclust(dist(t(mat)))
sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...)))
mat_cluster_cols <- sort_hclust(mat_cluster_cols)
plot(mat_cluster_cols, main = "Sorted Dendrogram", xlab = "", sub = "")
sort Dendrogram heatmap
mat_cluster_rows <- sort_hclust(hclust(dist(mat)))
pheatmap(
mat = mat,
color = inferno(length(mat_breaks) - 1),
breaks = mat_breaks,
border_color = NA,
cluster_cols = mat_cluster_cols,
cluster_rows = mat_cluster_rows,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_col = mat_col,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 14,
main = "Sorted Dendrograms"
)
change colnames angle
pheatmap(
mat = mat,
color = inferno(length(mat_breaks) - 1),
breaks = mat_breaks,
border_color = NA,
cluster_cols = mat_cluster_cols,
cluster_rows = mat_cluster_rows,
show_colnames = TRUE,
show_rownames = FALSE,
annotation_col = mat_col,
angle_col = 90,
fontsize_col = 8,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 10,
main = "Sorted Dendrograms"
)