Bayesian Quantification of Evidence Sufficiency

Dylan Lawless

Overview

quantbayes provides a minimal Bayesian transform for quantifying evidence sufficiency from a binary matrix of zero and one entries. The method is context free and produces reproducible scores across workflows and organisations.

This vignette shows:

  1. Loading the built in dataset
  2. Constructing the evidence matrix
  3. Running the core transform
  4. Producing visual summaries
  5. Highlighting variants
  6. Using file based input
  7. Saving output tables and plots
  8. Customising plot themes

Note: The variant identifier can be any character string. In the example dataset, the identifiers come from standard Whole Genome Sequencing output produced by Exomiser.


1. Load the example dataset

data(core_test_data)
head(core_test_data)
## # A tibble: 6 × 25
##   ID        interpret_flag_gt_va…¹ interpret_flag_moi_p…² interpret_flag_moi_p…³
##   <chr>                      <dbl>                  <dbl>                  <dbl>
## 1 2-542344…                      1                      1                      0
## 2 2-758567…                      1                      0                      1
## 3 2-334257…                      1                      1                      0
## 4 2-229915…                      1                      0                      1
## 5 2-815463…                      1                      0                      1
## 6 16-27979…                      1                      1                      0
## # ℹ abbreviated names: ¹​interpret_flag_gt_valid,
## #   ²​interpret_flag_moi_parent_gt_missing_mother,
## #   ³​interpret_flag_moi_parent_gt_missing_father
## # ℹ 21 more variables: interpret_flag_moi_parent_gt_missing_any <dbl>,
## #   interpret_flag_moi_parent_gt_hom_mother <dbl>,
## #   interpret_flag_moi_parent_gt_hom_father <dbl>,
## #   interpret_flag_moi_parent_gt_hom_any <dbl>, …

2. Build the binary evidence matrix

x <- as.matrix(core_test_data[, -1])
rownames(x) <- core_test_data[[1]]
dim(x)
## [1] 400  24

3. Run quantbayes core

res <- quant_es_core(x)
## Quant ES core: converting 2924 NA values to 0.
res$global
## $n_variants
## [1] 400
## 
## $m_rules
## [1] 24
## 
## $mean_theta
## [1] 0.5192308
## 
## $median_theta
## [1] 0.5384615
## 
## $lower_theta
##      2.5% 
## 0.3846154 
## 
## $upper_theta
##     97.5% 
## 0.6538462 
## 
## $ci_level
## [1] 0.95
head(res$variants)
## # A tibble: 6 × 7
##   variant_id               k     m theta_mean theta_lower theta_upper percentile
##   <chr>                <dbl> <int>      <dbl>       <dbl>       <dbl>      <dbl>
## 1 2-54234474-G-A_AR       18    24      0.731       0.549       0.879       99.9
## 2 2-758567696-A-N_AR      10    24      0.423       0.244       0.613        8  
## 3 2-334257224-G-GAAGG…    10    24      0.423       0.244       0.613        8  
## 4 2-229915282-A-C_AR      10    24      0.423       0.244       0.613        8  
## 5 2-815463976-T-G_AR      10    24      0.423       0.244       0.613        8  
## 6 16-279793-G-N_AD        11    24      0.462       0.278       0.651       23.4

4. Evidence sufficiency plots

plots <- quant_es_plots(res, x)

Global posterior distribution

plots$p_global

Overlay of top candidate densities

plots$p_overlay

Evidence matrix

plots$p_matrix

Observed evidence proportions

plots$p_p_hat

Posterior credible intervals

plots$p_theta_ci

Combined panel

plots$p_combined
## NULL

5. Highlighting variants of interest

highlight_demo <- list(
  list(id = "X-119469998-CT-C_XR", colour = "#ee4035", size = 4),
  list(id = res$variants$variant_id[1], colour = "#2f4356", size = 4)
)

plots2 <- quant_es_plots(res, x, highlight_points = highlight_demo)
plots2$p_overlay


6. File based input

quantbayes provides a helper for reading flat tables of binary evidence. Values must be 0, 1 or NA.

# Extract evidence columns
ev <- core_test_data[, -1]

# Keep only columns containing valid 0, 1, NA entries
is_binary_col <- function(x) all(x %in% c(0, 1, NA))
ev <- ev[, vapply(ev, is_binary_col, logical(1))]

tmpfile <- tempfile(fileext = ".tsv")

write.table(
  ev,
  tmpfile,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

res_file <- quant_es_from_binary_table(tmpfile, variant_col = NULL)
## Quant ES core: converting 2924 NA values to 0.
res_file$global
## $n_variants
## [1] 400
## 
## $m_rules
## [1] 24
## 
## $mean_theta
## [1] 0.5192308
## 
## $median_theta
## [1] 0.5384615
## 
## $lower_theta
##      2.5% 
## 0.3846154 
## 
## $upper_theta
##     97.5% 
## 0.6538462 
## 
## $ci_level
## [1] 0.95

7. Saving output tables

outdir <- tempdir()
if (!dir.exists(outdir)) dir.create(outdir)

write.csv(res$variants,
          file.path(outdir, "variants_results.csv"),
          row.names = FALSE)

write.csv(as.data.frame(res$global),
          file.path(outdir, "global_summary.csv"),
          row.names = FALSE)

8. Saving plots

ggsave(file.path(outdir, "overlay.png"),
       plots$p_overlay,
       width = 7,
       height = 4,
       dpi = 120)

ggsave(file.path(outdir, "matrix.png"),
       plots$p_matrix,
       width = 7,
       height = 6,
       dpi = 120)

9. Customising plot themes

plots$p_overlay + theme(legend.position = "none")

plots$p_overlay + theme_minimal()

plots$p_overlay + theme_void()

pal10 <- colorRampPalette(c("black", "grey"))(10)
pal20 <- colorRampPalette(c("skyblue", "navy"))(20)

plots_custom <- quant_es_plots(
  res,
  x,
  palette10 = pal10,
  palette20 = pal20
)

plots_custom$p_overlay


10. Flagship overlay plot

# Two highlighted candidates
swiss_red <- "#ee4035"
federal_blue <- "#2f4356"

# provide the points to highlight
highlight_flagship <- list(
  list(id = core_test_data[[1]][1], colour = swiss_red, size = 4),
  list(id = "6-32664815-C-G_AR", colour = federal_blue, size = 4)
)

plots_flagship <- quant_es_plots(
  res,
  x,
  highlight_points = highlight_flagship
)

# add custom ggplot settings
flagship_plot <- plots_flagship$p_overlay +
  ggplot2::guides(
    fill = ggplot2::guide_legend(title = "highlighted variants"),
    size = "none"
  ) +
  ggplot2::labs(
    title = "Posterior theta distribution",
    subtitle = "Top 10 CrI estimates with evidence available\nand two highlighted variants"
  ) +
  ggplot2::theme(
    legend.position = "right",
    legend.title = ggplot2::element_text(size = 10),
    legend.text = ggplot2::element_text(size = 9)
  )

flagship_plot

# Save flagship plot
outdir <- tempdir()
if (!dir.exists(outdir)) dir.create(outdir)

ggplot2::ggsave(
  filename = file.path(outdir, "flagship_plot.pdf"),
  plot = flagship_plot,
  width = 8,
  height = 4
)

Here is a minimal vignette block that includes the commands that print the human readable report strings, fully aligned with your style (British spelling, no em dashes, short, quiet).

Paste this directly into the vignette.


Core results example

Clinical genetics example

After Whole Genome Sequencing, a candidate selection tool identified possible causal variants.
Each was evaluated using a minimal independent evidence set.

res_df <- as.data.frame(res$variants)
global_df <- as.data.frame(res$global)

res_df <- res_df[order(res_df$theta_mean, decreasing = TRUE), ]

head(res_df)
##              variant_id  k  m theta_mean theta_lower theta_upper percentile
## 1     2-54234474-G-A_AR 18 24  0.7307692   0.5487120   0.8792833     99.875
## 276  6-72183475-CG-N_AR 18 24  0.7307692   0.5487120   0.8792833     99.875
## 84    1-14682421-G-A_AD 17 24  0.6923077   0.5061232   0.8505046     99.375
## 330  7-751853912-C-T_AR 17 24  0.6923077   0.5061232   0.8505046     99.375
## 44  X-224319469-CT-C_XR 16 24  0.6538462   0.4649993   0.8202832     97.000
## 51  X-414698664-CT-C_XD 16 24  0.6538462   0.4649993   0.8202832     97.000

Example output:

variant_id k m theta_mean theta_lower theta_upper percentile
2-54234474-G-A_AR 18 24 0.7308 0.5487 0.8793 99.875
6-72183475-CG-N_AR 18 24 0.7308 0.5487 0.8793 99.875
1-14682421-G-A_AD 17 24 0.6923 0.5061 0.8505 99.375
7-751853912-C-T_AR 17 24 0.6923 0.5061 0.8505 99.375
X-224319469-CT-C_XR 16 24 0.6538 0.4650 0.8203 97.000
X-414698664-CT-C_XD 16 24 0.6538 0.4650 0.8203 97.000

Report text (printed automatically)

The code below generates the concise human readable report strings.

# choose the top variant
v <- res_df[1, ]
g <- global_df

variant_line <- sprintf(
  "Posterior evidence sufficiency: %.3f (credible interval %.3f to %.3f, percentile %.2f)",
  v$theta_mean,
  v$theta_lower,
  v$theta_upper,
  v$percentile
)

global_line <- sprintf(
  "Global evidence sufficiency: %.2f (credible interval %.2f to %.2f)",
  g$mean_theta,
  g$lower_theta,
  g$upper_theta
)

variant_line
## [1] "Posterior evidence sufficiency: 0.731 (credible interval 0.549 to 0.879, percentile 99.88)"
global_line
## [1] "Global evidence sufficiency: 0.52 (credible interval 0.38 to 0.65)"

This produces:

Posterior evidence sufficiency: 0.731 (credible interval 0.549 to 0.879, percentile 99.88)
Global evidence sufficiency: 0.52 (credible interval 0.38 to 0.65)

Summary

quantbayes provides a minimal Bayesian transform for quantifying evidence sufficiency from a binary rule matrix. The package includes tools for plotting, highlighting, file based input, exporting results and saving plots for reports and pipelines.

quantbayes is designed to be simple, portable and easy to integrate in any workflow.