## ----load-depenencies, echo=TRUE, message=FALSE, warning=FALSE----------------
library(sesame)
library(SummarizedExperiment)
library(stats4)
library(sesameData)

## ----list-data, eval=TRUE, echo=TRUE------------------------------------------
sesameDataCache(keyword='KYCG')
databaseSetNames = sesameData:::df_master$Title[grepl('KYCG', sesameData:::df_master$Title)]

## ----print-data, eval=TRUE, echo=TRUE-----------------------------------------
head(databaseSetNames)

## ----cache-data, eval=TRUE, warning=FALSE-------------------------------------
databaseSetNames = c('KYCG.MM285.seqContextN.20210630',
                     'KYCG.MM285.designGroup.20210210',
                     'KYCG.MM285.chromosome.mm10.20210630',
                     'KYCG.MM285.probeType.20210630')
databaseSets = do.call(c, lapply(databaseSetNames, sesameDataGet))

## ----view-data0, eval=TRUE, warning=FALSE-------------------------------------
sprintf("length(databaseSets) = %s database sets", length(databaseSets))

## ----view-data1, eval=TRUE, warning=FALSE-------------------------------------
str(databaseSets[1:3])

## ----view-data2, eval=TRUE, warning=FALSE-------------------------------------
summary(databaseSets[1:3])

## ----cache-data2, eval=TRUE, echo=TRUE----------------------------------------
MM285.tissueSignature = sesameDataGet('MM285.141.SE.tissueSignature')
df = rowData(MM285.tissueSignature)
querySet = df$Probe_ID[df$branch == "E-Brain"]

## ----view-data4, eval=TRUE, echo=FALSE----------------------------------------
cat(sprintf("length(querySet) = %s probes", length(querySet)))

## ----run-annotation, echo=TRUE, eval=TRUE-------------------------------------
annotation = getDatabaseSetOverlap(querySet, databaseSets)
head(annotation)

## ----echo = FALSE, results="asis"---------------------------------------------
library(knitr)
df = data.frame(
    "Continuous DB"=c("Correlation","FGSEA"),
    "Discrete DB"=c("FGSEA","Fisher's Exact Test"))
rownames(df) = c("Continuous Query", "Discrete Query")
kable(df, caption="Four KnowYourCG Testing Scenarios")

## ----run-test-single, echo=TRUE, eval=TRUE------------------------------------
results = testEnrichment(querySet=querySet, 
                               databaseSets=databaseSets, 
                               verbose=FALSE)
print(head(results))

## ----plot-volcano, fig.width=7, fig.height=6, echo=TRUE, warning=FALSE--------
plotVolcano(data=results, title="Database Set Enrichment", 
            subtitle="MM285 Mouse Platform")

## ----plot-lollipop, fig.width=7, fig.height=6, echo=TRUE----------------------
plotLollipop(data=results, title="Top Database Set Enrichment", 
             subtitle="MM285 Mouse Platform")

## ----run-test-data, echo=TRUE, eval=TRUE--------------------------------------
KYCG.MM285.seqContextN.20210630 = sesameDataGet('KYCG.MM285.seqContextN.20210630')
CpGDesity50 = KYCG.MM285.seqContextN.20210630['CpGDesity50']
distToTSS = KYCG.MM285.seqContextN.20210630['distToTSS']

## ----run-test-other1, echo=TRUE, eval=TRUE------------------------------------
resultsCpGDensity = testEnrichment(querySet=querySet,
                                      databaseSets=CpGDesity50,
                                      platform="MM285")
print(resultsCpGDensity)

## ----run-test-other2, echo=TRUE, eval=TRUE------------------------------------
resultsTSS = testEnrichment(querySet=querySet, 
                               databaseSets=distToTSS, 
                               platform="MM285")
print(resultsTSS)

## ----run-test-other3, echo=TRUE, eval=TRUE------------------------------------
resultsCpGdensityTSS = testEnrichment(querySet=CpGDesity50[[1]],
                                         databaseSets=distToTSS,
                                         platform="MM285")
print(resultsCpGdensityTSS)

## ----load-data-tfbs, echo=TRUE, eval=TRUE-------------------------------------
databaseSets = sesameDataGet("KYCG.MM285.TFBS.20210817")

## ----run-test-tfbs, echo=TRUE, eval=TRUE, warning=FALSE-----------------------
resultsTFBS = testEnrichment(querySet=querySet, databaseSets=databaseSets,
                                platform="MM285", verbose=FALSE, 
                                return.meta=TRUE)
head(resultsTFBS)

## ----plot-volcano-tfbs, fig.width=7, fig.height=6, echo=TRUE------------------
plotVolcano(data=resultsTFBS, title="Transcription Factor Binding Site Enrichment",
            subtitle='MM285 Mouse Platform', alpha=0.0005)

## ----plot-lollipop-tfbs, fig.width=7, fig.height=6, echo=TRUE-----------------
plotLollipop(data=resultsTFBS, title="Transcription Factor Binding Site Enrichment",
             subtitle='MM285 Mouse Platform')

## ----run-test-gene, fig.width=7, fig.height=6, echo=TRUE, warning=FALSE-------
resultsGene = testEnrichmentGene(querySet, platform="MM285", verbose=FALSE)
head(resultsGene)

## ----plot-volcano-gene, fig.width=7, fig.height=6, echo=TRUE------------------
plotVolcano(data=resultsGene, title="Gene Enrichment",
            subtitle="MM285 Mouse Platform", n.fdr=TRUE)

## ----plot-lollipop-gene, fig.width=7, fig.height=6, echo=TRUE-----------------
plotLollipop(data=resultsGene, title="Top Gene Enrichment", 
             subtitle="MM285 Mouse Platform", n=10)

## ----run-feature-engineering-get-data, echo=TRUE, eval=TRUE-------------------
se = sesameDataGet('MM285.20Kx467.SE')
samplesheet = colData(se)[, c("Mouse_Age_Months",
         "Mouse_Age_Days", "Sex", "Strain_Corrected",
         "Tissue_Corrected", 'Genotype')]
betas = assay(se)

print(head(samplesheet))

## ----run-feature-engineering-statistics---------------------------------------
databaseSets = do.call(c, lapply(databaseSetNames, sesameDataGet))
statistics = calcDatabaseSetStatisticsAll(betas, databaseSets=databaseSets)
head(statistics[, 1:5])

## ----feature-engineering-subsetting-statistics--------------------------------
statistics = statistics[, grepl("mean", colnames(statistics))]
head(statistics[, 1:5])

## ----feature-engineering-data-curation, warning=FALSE-------------------------
data = cbind(data.frame(samplesheet), statistics)

data$Sex = relevel(factor(data$Sex), 'Female')
data$Strain_Corrected = relevel(factor(data$Strain_Corrected), '129/Sv')
data$Tissue_Corrected = relevel(factor(data$Tissue_Corrected),  'Colon')
data$Genotype = relevel(factor(data$Genotype), 'WT')

data$Mouse_Age_Days = as.numeric(data$Mouse_Age_Days)
data$Mouse_Age_Months = as.numeric(data$Mouse_Age_Months)

## ----feature-engineering-linear-model-----------------------------------------
model = lm(Mouse_Age_Days ~ Sex + Strain_Corrected + Tissue_Corrected + 
               Genotype + `CGI-mean` + `CTCF-mean` + `Clock-mean` + 
               `SNP-mean` + `SpermMeth-mean` + `VMR-mean`, data=data)

## -----------------------------------------------------------------------------
sessionInfo()

