ACM_BCB

Module 2: Epigenetic Clock Analysis

How Old Are You Really? From DNA Methylation to Biological Age

A Hands-On Workshop


Workshop Overview

This hands-on tutorial walks through four activities: **This tutorial requires R and R studio and has been tested on R4.5 and R4.6.** **Please download the [R markdown file and required data zipped file](https://github.com/kobor-lab/ACM_BCB/blob/main/Epigenetic_clock_analysis_zip.zip))** and unzip the files. This directory contain the script [Epigenetic_clock_analysis_RMD.Rmd](https://github.com/kobor-lab/ACM_BCB/blob/main/Epigenetic_clock_analysis_RMD.Rmd) and the data files for running the script.

  1. Activity 1 — DNA methylation exploration and PCA
  2. Activity 2 — Calculation of epigenetic clocks
  3. Activity 3 — Computation and visualization of Epigenetic Age Difference (EAD) and Epigenetic Age Acceleration (EAA)
  4. Activity 4 — Association analyses linking EAA to health-related variables, including Adults with Congenital Heart Disease (ACHD) surgery status with and without Age or cell-type correction

What’s Covered

Out of Scope


Setup: R Packages

Install required packages

# # Run once to install all required packages
packages <- c(
  "knitr",
  "dplyr",
  "tidyr",
  "reshape2",
  "table1",
  "broom",
  "ggplot2",
  "ggpubr",
  "patchwork",
  "pheatmap",
  "MASS",
  "sfsmisc",
  "robustbase"
)

install.packages(packages)

# Bioconductor packages
if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

# methylCIPHER and DunedinPACE are GitHub packages
if (!require("remotes", quietly = TRUE))
  install.packages("remotes")

remotes::install_github("HigginsChenLab/methylCIPHER")
remotes::install_github("danbelsky/DunedinPACE")


BiocManager::install("preprocessCore") # for DunedinPACE

Load libraries and helper functions

library(knitr)
library(dplyr)
library(tidyr)
library(reshape2)
library(table1)
library(broom)
library(methylCIPHER)
library(ggplot2)
library(ggpubr)
library(patchwork)
library(DunedinPACE)
library(pheatmap)
library(MASS)
library(sfsmisc)
library(robustbase)

dir.create("results")
# for PCA loading and variable association tests
wrapperPlotPCAR2 <- function(Loadings, Importance,
                                meta_categorical = NULL,
                                meta_continuous  = NULL,
                                P_threshold      = 0.05,
                                p_title          = "",
                                geom.text.size   = 3,
                                round_digit      = 3,
                                show_legend      = TRUE,
                                multiple_test_correction = TRUE) {
  
  library(broom)
  library(dplyr)
  library(ggplot2)
  
  # --- Helper: ANOVA R2 ---
  get_anova_r2 <- function(pc_idx, covar_name) {
    fit   <- aov(Loadings[, pc_idx] ~ meta_categorical[, covar_name])
    fit   <- car::Anova(fit, type = "II")
    tidy_fit <- tidy(fit)
    
    ss_reg <- tidy_fit$sumsq[1]
    ss_res <- tidy_fit$sumsq[2]
    
    data.frame(
      term      = covar_name,
      statistic = tidy_fit$statistic[1],
      PC        = paste0("PC", pc_idx),
      R_squared = ss_reg / (ss_reg + ss_res),
      p.value   = tidy_fit$p.value[1],
      Variation = get_variation(pc_idx),
      method    = "ANOVA"
    )
  }
  
  # --- Helper: Correlation R2 ---
  get_cor_r2 <- function(pc_idx, covar_name) {
    fit <- cor.test(Loadings[, pc_idx], meta_continuous[, covar_name],
                    method = "pearson", alternative = "two.sided")
    data.frame(
      term      = covar_name,
      statistic = fit$statistic,
      PC        = paste0("PC", pc_idx),
      R_squared = fit$estimate^2,
      p.value   = fit$p.value,
      Variation = get_variation(pc_idx),
      method    = "Pearson's product-moment correlation"
    )
  }
  
  # --- Helper: get % variance for a PC ---
  get_variation <- function(pc_idx) {
    round(Importance[pc_idx] * 100, 2)
  }
  
  # --- Run tests ---
  results_cat  <- if (!is.null(meta_categorical)) {
    do.call(rbind, lapply(1:ncol(Loadings), function(i)
      do.call(rbind, lapply(colnames(meta_categorical), function(cv) get_anova_r2(i, cv)))))
  }
  
  results_cont <- if (!is.null(meta_continuous)) {
    do.call(rbind, lapply(1:ncol(Loadings), function(i)
      do.call(rbind, lapply(colnames(meta_continuous), function(cv) get_cor_r2(i, cv)))))
  }
  
  # --- Combine results ---
  all_results <- rbind(results_cat, results_cont)
  all_results$adjP <- p.adjust(all_results$p.value, method = "bonferroni")
  
  variable_level <- rev(c(colnames(meta_categorical), colnames(meta_continuous)))
  
  # --- Significance label ---
  legend_title <- ifelse(multiple_test_correction, "Adjusted p", "Nominal p")
  all_results <- all_results %>%
    mutate(sig_color = ifelse(
      (if (multiple_test_correction) adjP else p.value) <= P_threshold,
      paste0("<=", P_threshold), paste0(">", P_threshold)
    ))
  
  # --- PC axis labels with % variance ---
  all_results$PC <- factor(all_results$PC)
  all_results$term <- factor(all_results$term, levels = variable_level)
  
  pc_var <- all_results %>% dplyr::select(PC, Variation) %>% distinct()
  levels(all_results$PC) <- paste0(pc_var$PC, "\n(", pc_var$Variation, "%)")
  
  # --- Color scale for significant associations---
  sig_colors <- setNames(
    c("#ffc4c4", "lightgrey"),
    c(paste0("<=", P_threshold), paste0(">", P_threshold))
  )
  
  # --- Plot ---
  p <- ggplot(all_results, aes(PC, term, fill = sig_color)) +
    geom_tile(color = "white") +
    geom_text(aes(label = round(R_squared, round_digit)), size = geom.text.size) +
    scale_fill_manual(values = sig_colors) +
    coord_fixed() +
    ggtitle(p_title) +
    guides(fill = guide_legend(title = legend_title)) +
    theme_minimal() +
    theme(
      panel.grid  = element_blank(),
      axis.title  = element_blank(),
      axis.ticks.y = element_blank()
    )
  
  if (!show_legend) p <- p + theme(legend.position = "none")
  
  return(list(plot = p, stats = all_results))
}


# plot Age vs. clock and Calculate Median Absolute Error
plotClockMdAE <- function(SampleInfo, clock_i){
  SampleInfo$EpiAge <- SampleInfo[, clock_i]
  
  mae_val <- median(abs(SampleInfo$EpiAge - SampleInfo$Age)) #median absolute error between clock and chronological age
  
  # Scatter plot
  ggplot(SampleInfo, aes(x = Age, y = EpiAge)) +
    geom_point(alpha = 0.6, color = "steelblue") +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "lightpink") +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray40") +
    stat_cor(method = "pearson", label.x.npc = "left", label.y.npc = "top") +
    annotate(
      "text",
      x = Inf, y = -Inf,
      hjust = 1.1, vjust = -1,
      label = paste0("Median Absolute Error = ", round(mae_val, 2), " years"),
      fontface = "italic",
      size = 4,
      color = "gray20"
    ) +
    labs(
      title = paste0("Chronological Age vs " ,clock_i),
      x     = "Chronological Age (years)",
      y     = "Epigenetic Age (years)"
    ) +
    theme_classic()
}

Activity 1: PCA of DNA Methylation Beta Values

Goal

Apply Principal Component Analysis (PCA) to whole blood DNA methylation beta values to identify the primary biological drivers of DNA methylation variantion, such as chronological age, sex, and estimated immune cell-type proportions. </div>

This activity uses a subset of samples and CpGs from dataset GSE286313 (from Zhuang & Jude et al. 2025). The CpG subset contains clock probes used to calculate the Horvath Pan-Tissue, Hannum, PhenoAge, GrimAge, and DunedinPACE clocks.

Load beta value and Sample metadata

The provided beta matrix has been preprocessed (background correction, functional normalization, batch correction for technical batches (Chip ID and Chip position))

SampleInfo <- readRDS("data/SampleInfo_n44_VHAS_CALERIE.rds")
beta_matrix <- readRDS("data/matrix_x_clock_probes_n44.rds")
dim(beta_matrix) # 41353    44

Explore data

Sample demographics

df <- SampleInfo
df$Sex    <- factor(df$Sex)
df$Cohort <- factor(df$Cohort)

# Optional: add labels
label(df$Age)    <- "Age (years)"
label(df$Sex)    <- "Sex"
label(df$Cohort) <- "Cohort"

# Create Table 1
table1(~ Age + Sex|Cohort, data = df)

table 1

DNA methylation beta value density

Beta_sample_melted<- reshape2::melt(beta_matrix)

Beta_Plot<-Beta_sample_melted[which(Beta_sample_melted$value >= 0),]
colnames(Beta_Plot)[2:3] <- c("Sample_Name", "Beta_Value")

# plot the density  
ggplot(Beta_Plot, aes_string("Beta_Value", group="Sample_Name",label = "Sample_Name"))+
  geom_density(size=0.2, alpha = 0.5)+
  theme_bw()+
    labs(
    title = "DNA Methylation density (selected clock probes = 41353)",
    x     = "DNAm beta value"
  )

DNAm_denisty_clock

Key takeaway: The bimodal distribution: with a peak near 0 (fully unmethylated) and near 1 (fully methylated),is expected in whole blood DNA methylation profiles.

📌 Note the bimodal distribution of selected clock probes has a different pattern compared to all probes on the array.

DNA methylation beta value density

Principal Component Analysis

PCA analysis

sexProbeIDs <- readRDS("data/sexProbeIDs.rds") # there are 886 probes mapped to Chr X or Y in these 41353 probes. Remove sex probes before running PCA
  
betas <- beta_matrix[!(rownames(beta_matrix)%in%rownames(sexProbeIDs)), ]
# PCA
PCA_full<-princomp(na.omit(betas))
Loadings<-as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance<-vars/sum(vars)

paste0(round(Importance*100, 2)[1:7], "%") #print the variance accounted for the top PCs

The top PC (PC0) accounted for 97.65% of the total variance, is dropped because it mainly represent the bi-modal distribution of the beta value, the rest of PCs are re-calibrated

### PCA ANOVA and correlation tests and plots----
f_prefix <- "PCA_anova_correlation"
p_title <- "R2 of top PCs and variable associations \n(clock probes)"
f_out_p <- paste0("results/",f_prefix, ".png")

PC_view_num <- 7 # this number included PC0, however, PC0 is not included in the association tests

#check if loading rownames are identical to Sample name
identical(rownames(Loadings), SampleInfo$Sample_Name)

# adjust importance (total variance accounted) after removing PC0
adjust<-1-Importance[1]
pca_adjusted_Importance<-Importance[2:length(Importance)]/adjust # new adjusted importance
names(pca_adjusted_Importance) <- paste0("PC", 1:length(pca_adjusted_Importance)) #must update Importance names to the new PC1
#Scree plot: variance explained per PC (PC0 removed)
tibble(
  PC              = paste0("PC", 1:6),
  VarExplained    = pca_adjusted_Importance[1:6]
) %>%
  mutate(PC = factor(PC, levels = PC)) %>%
  ggplot(aes(x = PC, y = VarExplained*100)) +
  geom_col(fill = "#2E75B6", colour = "white") +
  geom_line(aes(group = 1), colour = "#E74C3C", linewidth = 0.8) +
  geom_point(colour = "#E74C3C", size = 2) +
  theme_bw()+
    geom_text(
    aes(label = paste0(round(VarExplained * 100, 1), "%")),
    vjust = -0.5,
    fontface = "bold",
    size = 3.5
  ) +
  labs(
    title    = "Scree Plot: Variance Explained by adjusted top PCs (clock probes)",
    x        = "Principal Component (adjusted)",
    y        = "Variance Explained (%)"
  )

scree

Top PCs and association with variables and predicted cell-type proportions

Understand the main drivers of variation of the DNA methylation data.

meta_categorical <- SampleInfo %>% dplyr::select(Cohort,Sex)
meta_continuous <- SampleInfo %>% dplyr::select(Age,
                                         Bas, Bmem, Bnv, CD4mem, CD4nv, # DNA methylation predicted cell types
                                         CD8mem, CD8nv, Eos, Mono, Neu,
                                         NK, Treg)


pca_df<-data.frame(adjusted_variance=pca_adjusted_Importance, PC=seq(1:length(pca_adjusted_Importance)))

Loadings_association <- Loadings[, 2:PC_view_num] # remove PC0 loading, show PC1-PC_view_num
# identical(Loadings_association %>% rownames(), SampleInfo$Sample_Name)



tmp <- wrapperPlotPCAR2(Loadings=Loadings_association,
                        Importance = pca_adjusted_Importance,
                        meta_categorical=meta_categorical,
                        meta_continuous = meta_continuous,
                        P_threshold = 0.05,
                        p_title=p_title,
                        geom.text.size = 3,
                        round_digit=2,
                        show_legend=T, multiple_test_correction = T)
p_PCA_association <- tmp[[1]]
adj_test_results <- tmp[[2]] # the test results

ggsave(plot = p_PCA_association, filename = paste0(f_out_p), width = 6, height = 8, dpi = 300)


Association of top PCs with biological variables

Association of variables and loadings of the top 6 adjusted PCs. An overall F-test was carried out for separate simple linear regression models for each combination of PCs as dependent variable and the predictor variables: cohort, sex, age, and 12 cell-type proportions. R² is indicated in each cell; variance accounted (adjusted) per PC is shown in brackets on the x-axis. P-values are Bonferroni adjusted.

The same association tests using all array probes show a similar pattern:

Association of top PCs using all EPIC array probes

PCA scatter plots

Sex separates clearly along adjusted PC2.

# PC1 vs PC2 coloured by sex
pca_loadings <- cbind(SampleInfo, data.frame(PC1=Loadings[,1], PC2=Loadings[,2], PC3=Loadings[,3]))
p_sex <- ggplot(pca_loadings , aes(x = PC1, y = PC2, colour = Sex)) +
  geom_point(alpha = 0.8, size = 2.5) +
  scale_colour_manual(values = c("M" = "#2980B9", "F" = "#E74C3C")) +
  theme_bw()+
  labs(
    title    = "PCA of clock probes — Coloured by Sex",
    x = paste0("PC1 (", round(pca_adjusted_Importance[1]*100, 1), "%)"),
    y = paste0("PC2 (", round(pca_adjusted_Importance[2]*100, 1), "%)")
  )

p_sex

PCA_by_sex

Neutrophil proportion is significantly correlated with adjusted PC3.

#PC scores vs cell-type proportions
ggplot(pca_loadings, aes(x = PC3, y = Neu)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "lightpink") +
  stat_cor(method = "pearson", label.x.npc = "left", label.y.npc = "top") +
  labs(
    title = "PC2 vs Neutrophils (clock probes)",
    y = "Neutrophils (Neu)"
  ) +
  theme_classic()

PCA_neu

Key Takeaways — Activity 1

</div>


Activity 2: Calculating Epigenetic Clocks with methylCIPHER

Goal

Use the methylCIPHER and DunedinPACE packages to compute a panel of established epigenetic clocks spanning first-, second-, and third-generation approaches applicable to whole blood samples. </div>

Overview of epigenetic clock generations

Generation Example Clocks Trained To Predict Training Tissue
1st Horvath (2013) Chronological age multiple tissues including blood
1st Hannum (2013) Chronological age Blood
2nd PhenoAge (Levine 2018) Incorporate clinical biomarkers to predict phenotypic age Blood
2nd GrimAge (Lu 2019) Incorporate DNAm derived biomarkers to predict mortality (“time-to-death”) Blood
3rd DunedinPACE (Belsky 2022) Rate of biological aging (pace of aging) Blood

Calculating clocks with methylCIPHER

SampleInfo <- readRDS("data/SampleInfo_n44_VHAS_CALERIE.rds")
beta_matrix <- readRDS("data/matrix_x_clock_probes_n44.rds")
# dim(beta_matrix) # 41353    44
# ── methylCIPHER clock calculation 
beta_matrix <- beta_matrix
## clean up meta for clock calculation----
clock_meta <- SampleInfo[ , c("Sample_Name", "Sex","Age")]
colnames(clock_meta) <- c("Sample_Name", "Female", "Age")
clock_meta$Female <- gsub("female", "F", clock_meta$Female, ignore.case = T)
clock_meta$Female <- (clock_meta$Female=="F")*1 # must be F=1, M=0


identical(SampleInfo$Sample_Name, colnames(beta_matrix)) # must be T

#[methylCIPHER] calculate by methylCIPHER----
beta_matrix_t <- t(beta_matrix) # row as sample, and column as cpgs


x_all <- data.frame(Sample_Name = rownames(beta_matrix_t))

predictor_ls <- c("Horvath1", "Hannum", # first generation "Horvath1" = Horvath pan-tissue
                  "PhenoAge") # second generation
Horvath_age <- calcHorvath1(DNAm = beta_matrix_t, pheno = NULL, imputation = F) #If choose imputation = T, Need to provide of named vector of CpG Imputations
Hannum_age <- calcHannum(DNAm = beta_matrix_t, pheno = NULL, imputation = F)
PhenoAge <- calcPhenoAge(DNAm = beta_matrix_t, pheno = NULL, imputation = F)
GrimAge <- calcGrimAgeV1(beta_matrix_t, clock_meta)$GrimAgeV1 #second generation

Calculating DunedinPACE

library(DunedinPACE)
DunedinPACE <- PACEProjector(betas = beta_matrix,proportionOfProbesRequired = 0.7) %>% unlist()
SampleInfo_clock <- cbind(SampleInfo, data.frame(Horvath_age = Horvath_age, Hannum_age = Hannum_age,
                                           PhenoAge=PhenoAge, GrimAge = GrimAge, DunedinPACE = DunedinPACE))
# saveRDS(SampleInfo_clock, "data/SampleInfo_n44_VHAS_CALERIE_clocks_pre_calculated.rds")

Clock and age correlation and error

SampleInfo_clock <- readRDS("data/SampleInfo_n44_VHAS_CALERIE_clocks_pre_calculated.rds")
plotClockMdAE(SampleInfo_clock, "Horvath_age")
plotClockMdAE(SampleInfo_clock, "Hannum_age")
plotClockMdAE(SampleInfo_clock, "PhenoAge")
plotClockMdAE(SampleInfo_clock, "GrimAge")

clock_Horvath_Age clock_Hannum_Age clock_PhenoAge_Age clock_GrimAge_Age

Clock and age inter-correlation heatmap


library(pheatmap)
clock_with_age <- SampleInfo_clock %>% dplyr::select(Age, Horvath_age, Hannum_age, PhenoAge, GrimAge, DunedinPACE)
row.names(clock_with_age) <- SampleInfo_clock$Sample_Name
  
# Compute Pearson correlation matrix
cor_mat <- cor(clock_with_age, use = "pairwise.complete.obs")
diag(cor_mat) <- NA

pheatmap(
  cor_mat,
  display_numbers = TRUE,
  number_format   = "%.2f",
  fontsize_number = 10,
  main            = "Pearson Correlation: Epigenetic Clocks and Chronological Age",
  angle_col       = 45,
  na_col          = "lightgrey",
)

clock_heatmap

Key Takeaways — Activity 2

</div>


Activity 3: Computing Epigenetic Age Difference (EAD) and Epigenetic Age Acceleration (EAA)

Goal

Derive Epigenetic Age Difference (EAD) and Epigenetic Age Acceleration (EAA) from clock estimates. visualize both metrics and explore their inter-correlations to understand the differences between these two measures and make informed choice for downstream association analyses.

Definitions and calculations

Epigenetic Age Difference (EAD):

\[\text{EAD} = \text{Epigenetic Age} - \text{Chronological Age}\]

Epigenetic Age Acceleration (EAA) — residual from a linear regression of epigenetic age on chronological age:

\(\text{Epigenetic Age}_i = \alpha + \beta \cdot \text{Chronological Age}_i + \varepsilon_i\) \(\text{EAA}_i = \varepsilon_i\)

📌 DunedinPACE is a rate-based clock (units: pace of aging, not years). EAA and EAD are not calculated for DunedinPACE.

clocks <- c("Horvath_age", "Hannum_age", "PhenoAge", "GrimAge")
eaa_names <- c("Horvath_EAA", "Hannum_EAA", "PhenoAge_EAA", "GrimAge_EAA")
ead_names <- c("Horvath_EAD", "Hannum_EAD", "PhenoAge_EAD", "GrimAge_EAD")

EAA_EDA_results <- data.frame(Sample_Name = SampleInfo_clock$Sample_Name, DunedinPACE = SampleInfo_clock$DunedinPACE)
for (i in seq_along(clocks)) {
  EAA_EDA_results[[eaa_names[i]]] <- lm(SampleInfo_clock[[clocks[i]]] ~ SampleInfo_clock$Age)$residuals # calculate EAA,sometimes also called extrinsic EAA
  EAA_EDA_results[[ead_names[i]]] <- SampleInfo_clock[[clocks[i]]] - SampleInfo_clock$Age # EAD: clock and age differences
}

Visualize EAD, EAA, and DunedinPACE correlations

EAD and EAA generally show higher correlation within the same clock than across clocks, reflecting clock-specific methylation signatures.

row.names(EAA_EDA_results) <- EAA_EDA_results$Sample_Name
# Compute Pearson correlation matrix
cor_mat <- cor(EAA_EDA_results[,-1], use = "pairwise.complete.obs")
diag(cor_mat) <- NA

vars <- grep("EAA|EAD", colnames(EAA_EDA_results), value = T)

annotation_col <- data.frame(
  Type = ifelse(grepl("EAA", vars), "EAA", "EAD"),
  row.names = vars
)

pheatmap(
  cor_mat,
  display_numbers = TRUE,
  number_format   = "%.2f",
  fontsize_number = 10,
  main            = "Pearson Correlation: Epigenetic Clocks EAA and EAD correlation",
  angle_col       = 45,
  na_col          = "lightgrey",
  annotation_col  = annotation_col,
  annotation_row  = annotation_col
)

clock_EAA_EAD

EAA, EAD vs Age

clocks <- c("Horvath", "Hannum", "PhenoAge", "GrimAge")
plot_list <- list()

for (i in seq_along(clocks)) {
  df_long <- data.frame(EAA = EAA_EDA_results[, paste0(clocks[i], c("_EAA"))],
                        EAD = EAA_EDA_results[, paste0(clocks[i], c("_EAD"))],
                        Age = SampleInfo_clock$Age) %>%
    pivot_longer(cols = c("EAA", "EAD"), names_to = "Measure", values_to = "Value")
  
  plot_list[[i]] <- ggplot(df_long, aes(x = Age, y = Value, colour = Measure)) +
    geom_point(alpha = 0.6, size = 2) +
    geom_smooth(method = "lm", se = FALSE) +
    stat_cor(method = "pearson", label.x.npc = "left", label.y.npc = "top") +
    scale_colour_manual(values = c(EAA = "#2E75B6", EAD = "#E74C3C")) +
    labs(
      x      = "Age",
      y      = "EAA/EAD",
      colour = "Measure",
      title  = paste0(clocks[i])
    ) +
    theme_bw()
}

# Combine with shared legend
p_EAA_EAD <- wrap_plots(plot_list, ncol = 2, guides = "collect") &
  theme(legend.position = "bottom")

ggsave(plot = p_EAA_EAD, filename = paste0("results/EAA_EAD_vs_Age.png"), width = 6, height = 6, dpi = 300)


Relationship between EAA, EAD, and chronological age

Key Takeaways — Activity 3


Activity 4: Association Analyses — Biological Aging and Clinical Variables

Goal

Perform association analysis using robust linear regression to test the relationship between EAA and clinical/health-related variables. The primary exposure is ACHD surgery status (Fontan surgery vs. control), as studied in Jain & Zhuang et al. 2026. Model comparisons with and without age or cell-type correction. </div>

Sample demographics

SampleInfo_ACHD <- readRDS("data/SampleInfo_ACHD_n33.rds") 
library(table1)
df <- SampleInfo_ACHD
df$Sex    <- factor(df$Sex)
df$surgery <- factor(df$surgery)

# Optional: add labels
label(df$Age)    <- "Age (years)"
label(df$Sex)    <- "Sex"
label(df$surgery) <- "Surgery"


# Create Table 1
table1(~ Age + Sex|surgery, data = df)

table1_ACHD

📌 The control group is significantly older than the Fontan surgery group.
ggplot(SampleInfo_ACHD, aes(x = surgery, y = Age, fill = surgery)) +
  geom_boxplot(outlier.shape = 21, width = 0.5, alpha = 0.8) +
  geom_jitter(width = 0.15, alpha = 0.4, size = 1.5) +
  stat_compare_means(
    comparisons = list(c(levels(SampleInfo_ACHD$surgery)[1], 
                         levels(SampleInfo_ACHD$surgery)[2])),
    method = "wilcox.test",
    label = "p.format"
  ) +
  scale_fill_manual(values = c("#4E9AF1", "#F4845F")) +
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.10))) +
  labs(
    title = "Age by Surgery Group",
    x = "Surgery",
    y = "Age (years)"
  ) +
  theme_classic(base_size = 13) +
  theme(legend.position = "none")

ACHD_age_by_surgery

EAA calculation

clocks <- c("Horvath_age", "Hannum_age", "PhenoAge", "GrimAge")
eaa_names <- c("Horvath_EAA", "Hannum_EAA", "PhenoAge_EAA", "GrimAge_EAA")

for (i in seq_along(clocks)) {
  SampleInfo_ACHD[[eaa_names[i]]] <- lm(SampleInfo_ACHD[[clocks[i]]] ~ SampleInfo_ACHD$Age)$residuals # calculate EAA,sometimes also called extrinsic EAA (EEAA)
}

EAA association analysis

Cell-type PCs 1–3 account for >90% of the variance in the predicted 12 immune cell-type proportions and are used as covariates to correct for immune composition differences between surgery groups.

model:

EAA ~ surgery + Age + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3
📌 Since rlm() downweights outliers, the R² here reflects the robust fit, a more outlier-resistant effect-size estimate than standard OLS R² in regular linear regression.
library(MASS)
library(sfsmisc)
library(robustbase)

calculateCohenF <- function(rlm_model, rlm_reduced){
  # Extract R² for each model
  r2 <- function(model) {
    ss_res  <- sum(model$residuals^2)
    ss_tot  <- sum((model$model[[1]] - mean(model$model[[1]]))^2)
    1 - ss_res / ss_tot
  }
  
  r2_full    <- r2(rlm_model)
  r2_reduced <- r2(rlm_reduced)
  
  # Cohen's f² for surgery
  f2_surgery <- (r2_full - r2_reduced) / (1 - r2_full)
  
  # Cohen's f
  f_surgery <- sqrt(f2_surgery)
  return(f_surgery)
}


runRlm <- function(clock, input_meta) {
  input_meta$clock_EAA <- input_meta[, clock]
  

    full    <- rlm(clock_EAA ~ surgery + Age + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta) # the association model
    reduced <- rlm(clock_EAA ~          Age + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta) # for calculating cohen's f

  
  data.frame(
    EAA        = clock,
    pvalue     = f.robftest(full, var = "surgeryFontan")$p.value,
    cohenF     = calculateCohenF(full, reduced)
  )
}


input_meta <- SampleInfo_ACHD
levels(input_meta$surgery) # check the factor levels
variable_ls <- c("surgeryFontan") # use control as baseline to extract pvalue

clocks <- c("Horvath_EAA", "Hannum_EAA", "GrimAge_EAA", "PhenoAge_EAA", "DunedinPACE")


results <- do.call(rbind, lapply(clocks, function(clock) {
  rbind(
    runRlm(clock, input_meta)
  )
}))

rownames(results) <- NULL
results$p_adj <- p.adjust(results$pvalue, method = "bonferroni") #multiple testing correction
print(results)

Comparison: models with and without Age as a covariate



runRlmAge <- function(clock, input_meta, include_age) {
  input_meta$clock_EAA <- input_meta[, clock]
  
  if (include_age) {
    full    <- rlm(clock_EAA ~ surgery + Age + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta)
    reduced <- rlm(clock_EAA ~          Age + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta)
  } else {
    full    <- rlm(clock_EAA ~ surgery + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta)
    reduced <- rlm(clock_EAA ~          Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta)
  }
  
  data.frame(
    EAA        = clock,
    model      = ifelse(include_age, "with_Age", "without_Age"),
    pvalue     = f.robftest(full, var = "surgeryFontan")$p.value,
    cohenF     = calculateCohenF(full, reduced)
  )
}


input_meta <- SampleInfo_ACHD
levels(input_meta$surgery) # check the factor levels
variable_ls <- c("surgeryFontan") # use control as baseline to extract pvalue

clocks <- c("Horvath_EAA", "Hannum_EAA", "GrimAge_EAA", "PhenoAge_EAA", "DunedinPACE")


results_age <- do.call(rbind, lapply(clocks, function(clock) {
  rbind(
    runRlmAge(clock, input_meta, include_age = TRUE),
    runRlmAge(clock, input_meta, include_age = FALSE)

  )
}))

rownames(results_age) <- NULL
print(results_age)

In this dataset, including age as a covariate increases statistical significance (decreases p-value). Age was acting as a suppressor — the groups differ in age, and that difference was partially masking the EAA-surgery signal. Once Age is controlled for, the model more cleanly isolates the independent effect of surgery on EAA.

In this dataset, including age as a covariate increases statistical significance (decreases p-value). Age was acting as a suppressor — the groups differ in age, and that difference was partially masking the EAA-surgery signal. Once Age is controlled for, the model more cleanly isolates the independent effect of surgery on EAA. Overall, chronological age is typically added as a covariate in EAA association analysis.

Comparison: models with and without cell-type PCs


clocks <- c("Horvath_EAA", "Hannum_EAA", "GrimAge_EAA", "PhenoAge_EAA", "DunedinPACE")

runRlmCell <- function(clock, input_meta, include_cell) {
  input_meta$clock_EAA <- input_meta[, clock]
  
  if (include_cell) {
    full    <- rlm(clock_EAA ~ surgery + Age + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta)
    reduced <- rlm(clock_EAA ~          Age + Sex + Ancestry + cell_PC1 + cell_PC2 + cell_PC3, data = input_meta)
  } else {
    full    <- rlm(clock_EAA ~ surgery + Age + Sex + Ancestry , data = input_meta)
    reduced <- rlm(clock_EAA ~          Age + Sex + Ancestry, data = input_meta)
  }
  
  data.frame(
    EAA        = clock,
    model      = ifelse(include_cell, "with_celltypePCs", "without_celltypePCs"),
    pvalue     = f.robftest(full, var = "surgeryFontan")$p.value,
    cohenF     = calculateCohenF(full, reduced)
  )
}

results_cell <- do.call(rbind, lapply(clocks, function(clock) {
  rbind(
    runRlmCell(clock, SampleInfo_ACHD, include_cell = FALSE),
    runRlmCell(clock, SampleInfo_ACHD, include_cell = TRUE)
  )
}))

rownames(results_cell) <- NULL
print(results_cell)
Including cell type PCs increases the p-value for the EAA-surgery association, suggesting that cell type composition was previously confounding this signal. Fontan and control groups likely differ in blood immune cell proportions. Since epigenetic clocks are sensitive to cell-type composition, part of the apparent EAA difference reflects this extrinsic factor rather than intrinsic epigenetic aging. Overall, cell type compositions are typically accounted for EAA association analysis.

Key Takeaways — Activity 4

References

Session Information

# Record software versions for reproducibility 
sessionInfo()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Vancouver
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] robustbase_0.99-7  sfsmisc_1.1-24     MASS_7.3-65        pheatmap_1.0.13   
##  [5] DunedinPACE_0.99.0 patchwork_1.3.2    ggpubr_0.6.3       ggplot2_4.0.3     
##  [9] methylCIPHER_0.2.0 broom_1.0.13       table1_1.5.1       reshape2_1.4.5    
## [13] tidyr_1.3.2        dplyr_1.2.1        knitr_1.51        
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10           generics_0.1.4        rstatix_0.7.3        
##  [4] lattice_0.22-7        stringi_1.8.7         digest_0.6.39        
##  [7] magrittr_2.0.5        evaluate_1.0.5        grid_4.5.2           
## [10] RColorBrewer_1.1-3    fastmap_1.2.0         Matrix_1.7-4         
## [13] plyr_1.8.9            jsonlite_2.0.0        backports_1.5.1      
## [16] Formula_1.2-5         mgcv_1.9-3            purrr_1.2.2          
## [19] preprocessCore_1.72.0 scales_1.4.0          jquerylib_0.1.4      
## [22] abind_1.4-8           cli_3.6.6             rlang_1.2.0          
## [25] splines_4.5.2         withr_3.0.2           cachem_1.1.0         
## [28] yaml_2.3.12           tools_4.5.2           checkmate_2.3.4      
## [31] ggsignif_0.6.4        vctrs_0.7.3           R6_2.6.1             
## [34] lifecycle_1.0.5       stringr_1.6.0         car_3.1-5            
## [37] pkgconfig_2.0.3       pillar_1.11.1         bslib_0.11.0         
## [40] gtable_0.3.6          glue_1.8.1            Rcpp_1.1.1-1.1       
## [43] DEoptimR_1.2-0        xfun_0.58             tibble_3.3.1         
## [46] tidyselect_1.2.1      farver_2.1.2          nlme_3.1-168         
## [49] htmltools_0.5.9       labeling_0.4.3        rmarkdown_2.31       
## [52] carData_3.0-6         compiler_4.5.2        S7_0.2.2