Overview

quantbayes provides a minimal Bayesian transform for quantifying evidence sufficiency from a binary matrix of zero and one entries. The method is context free and produces reproducible scores across workflows and organisations.

This vignette shows:

Loading the built in dataset
Constructing the evidence matrix
Running the core transform
Producing visual summaries
Highlighting variants
Using file based input
Saving output tables and plots
Customising plot themes

Note: The variant identifier can be any character string. In the example dataset, the identifiers come from standard Whole Genome Sequencing output produced by Exomiser.

1. Load the example dataset

data(core_test_data)
head(core_test_data)

## # A tibble: 6 × 25
##   ID        interpret_flag_gt_va…¹ interpret_flag_moi_p…² interpret_flag_moi_p…³
##   <chr>                      <dbl>                  <dbl>                  <dbl>
## 1 2-542344…                      1                      1                      0
## 2 2-758567…                      1                      0                      1
## 3 2-334257…                      1                      1                      0
## 4 2-229915…                      1                      0                      1
## 5 2-815463…                      1                      0                      1
## 6 16-27979…                      1                      1                      0
## # ℹ abbreviated names: ¹interpret_flag_gt_valid,
## #   ²interpret_flag_moi_parent_gt_missing_mother,
## #   ³interpret_flag_moi_parent_gt_missing_father
## # ℹ 21 more variables: interpret_flag_moi_parent_gt_missing_any <dbl>,
## #   interpret_flag_moi_parent_gt_hom_mother <dbl>,
## #   interpret_flag_moi_parent_gt_hom_father <dbl>,
## #   interpret_flag_moi_parent_gt_hom_any <dbl>, …

2. Build the binary evidence matrix

x <- as.matrix(core_test_data[, -1])
rownames(x) <- core_test_data[[1]]
dim(x)

## [1] 400  24

3. Run quantbayes core

res <- quant_es_core(x)

## Quant ES core: converting 2924 NA values to 0.

res$global

## $n_variants
## [1] 400
## 
## $m_rules
## [1] 24
## 
## $mean_theta
## [1] 0.5192308
## 
## $median_theta
## [1] 0.5384615
## 
## $lower_theta
##      2.5% 
## 0.3846154 
## 
## $upper_theta
##     97.5% 
## 0.6538462 
## 
## $ci_level
## [1] 0.95

head(res$variants)

## # A tibble: 6 × 7
##   variant_id               k     m theta_mean theta_lower theta_upper percentile
##   <chr>                <dbl> <int>      <dbl>       <dbl>       <dbl>      <dbl>
## 1 2-54234474-G-A_AR       18    24      0.731       0.549       0.879       99.9
## 2 2-758567696-A-N_AR      10    24      0.423       0.244       0.613        8  
## 3 2-334257224-G-GAAGG…    10    24      0.423       0.244       0.613        8  
## 4 2-229915282-A-C_AR      10    24      0.423       0.244       0.613        8  
## 5 2-815463976-T-G_AR      10    24      0.423       0.244       0.613        8  
## 6 16-279793-G-N_AD        11    24      0.462       0.278       0.651       23.4

4. Evidence sufficiency plots

plots <- quant_es_plots(res, x)

Global posterior distribution

plots$p_global

Overlay of top candidate densities

plots$p_overlay

Evidence matrix

plots$p_matrix

Observed evidence proportions

plots$p_p_hat

Posterior credible intervals

plots$p_theta_ci

Combined panel

plots$p_combined

## NULL

5. Highlighting variants of interest

highlight_demo <- list(
  list(id = "X-119469998-CT-C_XR", colour = "#ee4035", size = 4),
  list(id = res$variants$variant_id[1], colour = "#2f4356", size = 4)
)

plots2 <- quant_es_plots(res, x, highlight_points = highlight_demo)
plots2$p_overlay

6. File based input

quantbayes provides a helper for reading flat tables of binary evidence. Values must be 0, 1 or NA.

# Extract evidence columns
ev <- core_test_data[, -1]

# Keep only columns containing valid 0, 1, NA entries
is_binary_col <- function(x) all(x %in% c(0, 1, NA))
ev <- ev[, vapply(ev, is_binary_col, logical(1))]

tmpfile <- tempfile(fileext = ".tsv")

write.table(
  ev,
  tmpfile,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

res_file <- quant_es_from_binary_table(tmpfile, variant_col = NULL)

## Quant ES core: converting 2924 NA values to 0.

res_file$global

## $n_variants
## [1] 400
## 
## $m_rules
## [1] 24
## 
## $mean_theta
## [1] 0.5192308
## 
## $median_theta
## [1] 0.5384615
## 
## $lower_theta
##      2.5% 
## 0.3846154 
## 
## $upper_theta
##     97.5% 
## 0.6538462 
## 
## $ci_level
## [1] 0.95

7. Saving output tables

outdir <- tempdir()
if (!dir.exists(outdir)) dir.create(outdir)

write.csv(res$variants,
          file.path(outdir, "variants_results.csv"),
          row.names = FALSE)

write.csv(as.data.frame(res$global),
          file.path(outdir, "global_summary.csv"),
          row.names = FALSE)

8. Saving plots

ggsave(file.path(outdir, "overlay.png"),
       plots$p_overlay,
       width = 7,
       height = 4,
       dpi = 120)

ggsave(file.path(outdir, "matrix.png"),
       plots$p_matrix,
       width = 7,
       height = 6,
       dpi = 120)

9. Customising plot themes

plots$p_overlay + theme(legend.position = "none")

plots$p_overlay + theme_minimal()

plots$p_overlay + theme_void()

pal10 <- colorRampPalette(c("black", "grey"))(10)
pal20 <- colorRampPalette(c("skyblue", "navy"))(20)

plots_custom <- quant_es_plots(
  res,
  x,
  palette10 = pal10,
  palette20 = pal20
)

plots_custom$p_overlay

10. Flagship overlay plot

# Two highlighted candidates
swiss_red <- "#ee4035"
federal_blue <- "#2f4356"

# provide the points to highlight
highlight_flagship <- list(
  list(id = core_test_data[[1]][1], colour = swiss_red, size = 4),
  list(id = "6-32664815-C-G_AR", colour = federal_blue, size = 4)
)

plots_flagship <- quant_es_plots(
  res,
  x,
  highlight_points = highlight_flagship
)

# add custom ggplot settings
flagship_plot <- plots_flagship$p_overlay +
  ggplot2::guides(
    fill = ggplot2::guide_legend(title = "highlighted variants"),
    size = "none"
  ) +
  ggplot2::labs(
    title = "Posterior theta distribution",
    subtitle = "Top 10 CrI estimates with evidence available\nand two highlighted variants"
  ) +
  ggplot2::theme(
    legend.position = "right",
    legend.title = ggplot2::element_text(size = 10),
    legend.text = ggplot2::element_text(size = 9)
  )

flagship_plot

# Save flagship plot
outdir <- tempdir()
if (!dir.exists(outdir)) dir.create(outdir)

ggplot2::ggsave(
  filename = file.path(outdir, "flagship_plot.pdf"),
  plot = flagship_plot,
  width = 8,
  height = 4
)

Here is a minimal vignette block that includes the commands that print the human readable report strings, fully aligned with your style (British spelling, no em dashes, short, quiet).

Paste this directly into the vignette.

Core results example

Clinical genetics example

After Whole Genome Sequencing, a candidate selection tool identified possible causal variants.
Each was evaluated using a minimal independent evidence set.

res_df <- as.data.frame(res$variants)
global_df <- as.data.frame(res$global)

res_df <- res_df[order(res_df$theta_mean, decreasing = TRUE), ]

head(res_df)

##              variant_id  k  m theta_mean theta_lower theta_upper percentile
## 1     2-54234474-G-A_AR 18 24  0.7307692   0.5487120   0.8792833     99.875
## 276  6-72183475-CG-N_AR 18 24  0.7307692   0.5487120   0.8792833     99.875
## 84    1-14682421-G-A_AD 17 24  0.6923077   0.5061232   0.8505046     99.375
## 330  7-751853912-C-T_AR 17 24  0.6923077   0.5061232   0.8505046     99.375
## 44  X-224319469-CT-C_XR 16 24  0.6538462   0.4649993   0.8202832     97.000
## 51  X-414698664-CT-C_XD 16 24  0.6538462   0.4649993   0.8202832     97.000

Example output:

variant_id	k	m	theta_mean	theta_lower	theta_upper	percentile
2-54234474-G-A_AR	18	24	0.7308	0.5487	0.8793	99.875
6-72183475-CG-N_AR	18	24	0.7308	0.5487	0.8793	99.875
1-14682421-G-A_AD	17	24	0.6923	0.5061	0.8505	99.375
7-751853912-C-T_AR	17	24	0.6923	0.5061	0.8505	99.375
X-224319469-CT-C_XR	16	24	0.6538	0.4650	0.8203	97.000
X-414698664-CT-C_XD	16	24	0.6538	0.4650	0.8203	97.000

Report text (printed automatically)

The code below generates the concise human readable report strings.

# choose the top variant
v <- res_df[1, ]
g <- global_df

variant_line <- sprintf(
  "Posterior evidence sufficiency: %.3f (credible interval %.3f to %.3f, percentile %.2f)",
  v$theta_mean,
  v$theta_lower,
  v$theta_upper,
  v$percentile
)

global_line <- sprintf(
  "Global evidence sufficiency: %.2f (credible interval %.2f to %.2f)",
  g$mean_theta,
  g$lower_theta,
  g$upper_theta
)

variant_line

## [1] "Posterior evidence sufficiency: 0.731 (credible interval 0.549 to 0.879, percentile 99.88)"

global_line

## [1] "Global evidence sufficiency: 0.52 (credible interval 0.38 to 0.65)"

This produces:

Posterior evidence sufficiency: 0.731 (credible interval 0.549 to 0.879, percentile 99.88)
Global evidence sufficiency: 0.52 (credible interval 0.38 to 0.65)

Bayesian Quantification of Evidence Sufficiency

Dylan Lawless