pheatmap_advanced - Coding for Life Sciences

Case 1

The datasets were provided by data-to-viz

library(tidyverse)
library(pheatmap)
library(ggplot2)
library(viridis)
library(kableExtra)
### dataset 1
data <- read.table("https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/13_AdjacencyDirectedWeighted.csv", header=TRUE)
# show data
data %>% head(3) %>% select(1:3) %>% kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = F)

	Africa	East.Asia	Europe
Africa	3.142471	0.000000	2.107883
East Asia	0.000000	1.630997	0.601265
Europe	0.000000	0.000000	2.401476

### the following function were embeded in pheatmap source code
scale_rows = function(x){
    m = apply(x, 1, mean, na.rm = T)
    s = apply(x, 1, sd, na.rm = T)
    return((x - m) / s)
}

scale_mat = function(mat, scale){
    if(!(scale %in% c("none", "row", "column"))){
        stop("scale argument shoud take values: 'none', 'row' or 'column'")
    }
    mat = switch(scale, none = mat, row = scale_rows(mat), column = t(scale_rows(t(mat))))
    return(mat)
}

generate_breaks = function(x, n, center = F){
    if(center){
        m = max(abs(c(min(x, na.rm = T), max(x, na.rm = T))))
        res = seq(-m, m, length.out = n + 1)
    }
    else{
        res = seq(min(x, na.rm = T), max(x, na.rm = T), length.out = n + 1)
    }
    
    return(res)
}


data.plot <- scale_mat(mat = data,scale = "column")
breaks <- generate_breaks(data.plot,n = 256,center = F)

pheatmap::pheatmap(mat = data.plot,
                   cluster_cols = F,
                   cluster_rows = F,
                   scale = "column",border_color = "white",
                   color = viridis(n = 256, alpha = 1, 
                                   begin = 0, end = 1, option = "viridis"),
                   breaks = breaks)

case 2

the codes were adapted from slowkow Sort dendrogram is very important

set.seed(42)
random_string <- function(n) {
  substr(paste(sample(letters), collapse = ""), 1, n)
}

mat <- matrix(rgamma(1000, shape = 1) * 5, ncol = 50)

colnames(mat) <- paste(
  rep(1:3, each = ncol(mat) / 3),
  replicate(ncol(mat), random_string(5)),
  sep = ""
)
rownames(mat) <- replicate(nrow(mat), random_string(3))

mat %>% as.data.frame %>% head(3) %>% select(1:3) %>% kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = F)

	1jrqxa	1pskvw	1ojvwz
abv	9.6964789	9.172811	2.827695
nft	0.9020955	15.575853	4.328376
xha	2.6721643	3.127039	1.765077

split data into 3 groups, and increase the values in group1

col_groups <- substr(colnames(mat), 1, 1)
mat[,col_groups == "1"] <- mat[,col_groups == "1"] * 5

making the heatmap

# install.packages("pheatmap", "RColorBrewer", "viridis")
library(pheatmap)
library(RColorBrewer)
library(viridis)

# Data frame with column annotations.
mat_col <- data.frame(group = col_groups)
rownames(mat_col) <- colnames(mat)

# List with colors for each annotation.
mat_colors <- list(group = brewer.pal(3, "Set1"))
names(mat_colors$group) <- unique(col_groups)

pheatmap(
  mat               = mat,
  color             = inferno(10),
  border_color      = NA,
  show_colnames     = FALSE,
  show_rownames     = FALSE,
  annotation_col    = mat_col,
  annotation_colors = mat_colors,
  drop_levels       = TRUE,
  fontsize          = 14,
  main              = "Default Heatmap"
)

The default color breaks in pheatmap are uniformly distributed across the range of the data.

We can see that values in group 1 are larger than values in groups 2 and 3. However, we can’t distinguish different values within groups 2 and 3.

## ----uniform-color-breaks------------------------------------------------

mat_breaks <- seq(min(mat), max(mat), length.out = 10)

dat <- data.frame(values = as.numeric(mat))

## ----uniform-color-breaks-detail, fig.height=2, echo=FALSE---------------
dat_colors <- data.frame(
  xmin = mat_breaks[1:(length(mat_breaks)-1)],
  xmax = mat_breaks[2:length(mat_breaks)],
  ymin = 0,
  ymax = max(density(mat, bw = "SJ")$y),
  fill = rev(inferno(length(mat_breaks) - 1)),
  stringsAsFactors = FALSE
)
ggplot() +
  geom_rect(
    data = dat_colors,
    mapping = aes(
      xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill
    )
  ) +
  geom_density(
    data = dat,
    mapping = aes(values),
    bw = "SJ", color = "cyan"
  ) +
  scale_fill_manual(values = dat_colors$fill) +
  cowplot::theme_cowplot()+
  theme(legend.position = "none") +
  labs(title = "Uniform breaks")

there are 6 data points greater than or equal to 100 are represented with 4 different colors.

dat2 <- as.data.frame(table(cut(
  mat, mat_breaks
)))
dat2$fill <- inferno(nrow(dat2))
ggplot() +
  geom_bar(
    data = dat2,
    mapping = aes(x = Var1, weight = Freq, fill = Var1),
    color = "black", size = 0.1
  ) +
  coord_flip() +
  scale_fill_manual(values = dat2$fill) +
  cowplot::theme_cowplot()+
  theme(legend.position = "none") +
  labs(y = "data points", x = "breaks",
       title = "Number of data points per color")

If we reposition the breaks at the quantiles of the data, then each color will represent an equal proportion of the data:

quantile_breaks <- function(xs, n = 10) {
  breaks <- quantile(xs, probs = seq(0, 1, length.out = n))
  breaks[!duplicated(breaks)]
}

mat_breaks <- quantile_breaks(mat, n = 11)

lets see

dat_colors <- data.frame(
  xmin = mat_breaks[1:(length(mat_breaks)-1)],
  xmax = mat_breaks[2:length(mat_breaks)],
  ymin = 0,
  ymax = max(density(mat, bw = "SJ")$y),
  fill = rev(inferno(length(mat_breaks) - 1)),
  stringsAsFactors = FALSE
)
ggplot() +
  geom_rect(
    data = dat_colors,
    mapping = aes(
      xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill
    )
  ) +
  geom_density(
    data = dat,
    mapping = aes(values),
    bw = "SJ", color = "cyan"
  ) +
  scale_fill_manual(values = dat_colors$fill) +
  theme(legend.position = "none") +
  labs(title = "Quantile breaks")

dat2 <- as.data.frame(table(cut(
  mat, mat_breaks
)))
dat2$fill <- inferno(nrow(dat2))
ggplot() +
  geom_bar(
    data = dat2,
    mapping = aes(x = Var1, weight = Freq, fill = Var1),
    color = "black", size = 0.1
  ) +
  coord_flip() +
  scale_fill_manual(values = dat2$fill) +
  theme(legend.position = "none") +
  labs(y = "data points", x = "breaks",
       title = "Number of data points per color")

When we use quantile breaks in the heatmap, we can clearly see that group 1 values are much larger than values in groups 2 and 3, and we can also distinguish different values within groups 2 and 3:

pheatmap(
  mat               = mat,
  color             = inferno(length(mat_breaks) - 1),
  breaks            = mat_breaks,
  border_color      = NA,
  show_colnames     = FALSE,
  show_rownames     = FALSE,
  annotation_col    = mat_col,
  annotation_colors = mat_colors,
  drop_levels       = TRUE,
  fontsize          = 14,
  main              = "Quantile Color Scale"
)

We can also transform data

pheatmap(
  mat               = log10(mat),
  color             = inferno(10),
  border_color      = NA,
  show_colnames     = FALSE,
  show_rownames     = FALSE,
  annotation_col    = mat_col,
  annotation_colors = mat_colors,
  drop_levels       = TRUE,
  fontsize          = 14,
  main              = "Log10 Transformed Values"
)

sort dendrograms

library(dendsort)

mat_cluster_cols <- hclust(dist(t(mat)))


sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...)))

mat_cluster_cols <- sort_hclust(mat_cluster_cols)
plot(mat_cluster_cols, main = "Sorted Dendrogram", xlab = "", sub = "")

sort Dendrogram heatmap

mat_cluster_rows <- sort_hclust(hclust(dist(mat)))
pheatmap(
  mat               = mat,
  color             = inferno(length(mat_breaks) - 1),
  breaks            = mat_breaks,
  border_color      = NA,
  cluster_cols      = mat_cluster_cols,
  cluster_rows      = mat_cluster_rows,
  show_colnames     = FALSE,
  show_rownames     = FALSE,
  annotation_col    = mat_col,
  annotation_colors = mat_colors,
  drop_levels       = TRUE,
  fontsize          = 14,
  main              = "Sorted Dendrograms"
)

change colnames angle

pheatmap(
  mat               = mat,
  color             = inferno(length(mat_breaks) - 1),
  breaks            = mat_breaks,
  border_color      = NA,
  cluster_cols      = mat_cluster_cols,
  cluster_rows      = mat_cluster_rows,
  show_colnames     = TRUE,
  show_rownames     = FALSE,
  annotation_col    = mat_col,
  angle_col = 90,
  fontsize_col  = 8,
  annotation_colors = mat_colors,
  drop_levels       = TRUE,
  fontsize          = 10,
  main              = "Sorted Dendrograms"
)