## How it's built:  (note that we do NOT use symbols from the GEO GPL matrix)
require(GEOquery)
hm450k <- getGEO('GPL13534')@dataTable@table
names(hm450k) <- toupper(names(hm450k))
hm27k <- getGEO('GPL8490')@dataTable@table
names(hm27k) <- toupper(names(hm27k))
hm450k$PLATFORM <- 'HM450'
hm450k$CHANNEL27 <- NA
hm450k$CHANNEL450 <- hm450k$COLOR_CHANNEL
hm450k$ADDRESSA_450 <- hm450k$ADDRESSA_ID
hm450k$ADDRESSB_450 <- hm450k$ADDRESSB_ID
hm450k$ADDRESSA_27 <- NA
hm450k$ADDRESSB_27 <- NA
hm27k$PLATFORM <- 'HM27'
hm450k$PLATFORM[ which(hm450k$NAME %in% hm27k$NAME) ] <- 'BOTH'
hm27k$PLATFORM[ which(hm27k$NAME %in% hm450k$NAME) ] <- 'BOTH'
hm27k$CHANNEL27 <- hm27k$COLOR_CHANNEL
hm27k$CHANNEL450 <- NA
hm27k$ADDRESSA_450 <- NA
hm27k$ADDRESSB_450 <- NA
hm27k$ADDRESSA_27 <- hm27k$ADDRESSA_ID
hm27k$ADDRESSB_27 <- hm27k$ADDRESSB_ID
hm27k$STRAND <- '*' ## this we do the brute-force way
commonColumns <- intersect( names(hm450k), names(hm27k) )
rownames(hm450k) <- hm450k$ID
rownames(hm27k) <- hm27k$ID

## this makes it relatively painless to move from hg18 to hg19
df2GR <- function(df, keepColumns=FALSE, ignoreStrand=FALSE){ # {{{
  require(GenomicRanges)
  stopifnot(class(df) == "data.frame")
  subs <- list(chromStart='start', chromEnd='end', chrom='chr', seqnames='chr')
  for(s in names(subs)) names(df) = gsub(s, subs[[s]], names(df), ignore=TRUE)
  stopifnot(all(c("start", "end") %in% names(df)))
  if('genome' %in% names(attributes(df))) g <- attr(df, 'genome') else g <- NULL
  if(substr(df$chr, 1, 3)[1] != 'chr') df$chr <- paste('chr', df$chr, sep='')
  df <- subset(df, !is.na(start) & !is.na(end))
  if(!ignoreStrand && ("strand" %in% names(df))) {
    if(is.numeric(df$strand)) df$strand <- strandMe(df$strand)
    GR <- with(df, GRanges(chr, IRanges(start=start, end=end), strand=strand))
  } else {
    GR <- with(df, GRanges(chr, IRanges(start=start, end=end)))
  }
  if('name' %in% names(df)) {
    names(GR) <- df$name
    df$name <- NULL
  } else {
    names(GR) <- rownames(df)
  }
  if(keepColumns) {
    skipped = c("rangename","chr","start","end","width","strand")
    elementMetadata(GR) <- as(df[, setdiff(names(df), skipped), drop=F],
                              "DataFrame")
  }
  if('X' %in% names(elementMetadata(GR))) {
    if(all(is.na(GR$X))) {
      GR$X <- NULL
    } else {
      names(elementMetadata(GR))[which(names(elementMetadata(GR))=='X')]='score'
    }
  }
  if(!is.null(g)) genome(GR) <- g
  return(GR)
} # }}}

## now we need to update the hm27k probe annotations:
hm27k$start <- hm27k$end <- as.numeric(hm27k$MAPINFO)
hm27k$chrom <- hm27k$CHR
hm27k$name <- hm27k$ID 
library(rtracklayer)
hm27k.gr <- df2GR(hm27k)
genome(hm27k.gr) <- 'hg18'
mcols(hm27k.gr)$name <- names(hm27k.gr)
load('hg18ToHg19.rda') ## from inst/build/
hm27k.lifted <- unlist(liftOver(hm27k.gr, hg18ToHg19))
names(hm27k.lifted) <- mcols(hm27k.lifted)[,1]
genome(hm27k.lifted) <- 'hg19'

## we have 4 probes that just don't map; zero them out
## 
skip <- which(!(rownames(hm27k) %in% names(hm27k.lifted)))
noLift <- setdiff(rownames(hm27k), names(hm27k.lifted))
lifted <- intersect(rownames(hm27k), names(hm27k.lifted))
## 
## hm27k[ noLift, 'CHR' ]
## [3] "17" "7"  "7"  "17"
## 
## Even allowing up to 5 mismatches they still don't map!
## Everything else, update it in the table for hm27k:
##
## identical( rownames(hm27k)[ -skip ], lifted) ## can check if desired
unmappable <- noLift ## load-time message re: dropped probes; will drop from GR
hm27k$CHR[ -skip ] <- gsub('chr','',as(hm27k.lifted, 'data.frame')$seqnames)
hm27k$MAPINFO[ -skip ] <- start(hm27k.lifted)
hm27k$CHR[ which(rownames(hm27k) %in% noLift) ] <- 'Un' ## this becomes 'chrUn'
hm27k$MAPINFO[ which(rownames(hm27k) %in% noLift) ] <- 0
kept <- match(rownames(hm27k)[which(hm27k$PLATFORM=='BOTH')],rownames(hm450k))
hm450k$CHANNEL27[ kept ] = hm27k$CHANNEL27[ which(hm27k$PLATFORM == 'BOTH') ]
hm450k$ADDRESSA_27[ kept ] = hm27k$ADDRESSA_27[which(hm27k$PLATFORM == 'BOTH')]
hm450k$ADDRESSB_27[ kept ] = hm27k$ADDRESSB_27[which(hm27k$PLATFORM == 'BOTH')]

hm27k.allProbes <- hm27k ## in case we need it later, if we screw up
hm27k <- hm27k[ which(hm27k$PLATFORM == 'HM27'), ] ## only old probes
keepColumns <- c('ADDRESSA_450','ADDRESSB_450', 'ADDRESSA_27', 'ADDRESSB_27',
                 'CHANNEL27','CHANNEL450', 'CHR','MAPINFO','SOURCESEQ',
                 'STRAND','PLATFORM','NAME')
infMeth.hg19 <- rbind(hm450k[, keepColumns], hm27k[, keepColumns])

## note that SNP probes do not have MAPINFO; will fix this from dbSNP
noMap <- which(is.na(as.numeric(infMeth.hg19$MAPINFO)))
noMap.ids <- infMeth.hg19$NAME[ noMap ]
message(paste('Note: GEO is missing genomic coordinates for probes',
              paste(noMap.ids, collapse=', ')))
load("../../data/hm450k.rsProbes.rda")
## data(hm450k.rsProbes) # extracted from dbSNP
infMeth.hg19$CHR[noMap] = gsub('chr','',hm450k.rsProbes[noMap.ids,'CHR'])
infMeth.hg19$MAPINFO[ noMap ] = hm450k.rsProbes[ noMap.ids, 'MAPINFO' ]
infMeth.hg19$STRAND[ noMap ] = '*'
droppedProbes.hg19 <- which(infMeth.hg19$CHR %in% c('', 'Un'))
infMeth.hg19 <- infMeth.hg19[ -droppedProbes.hg19, ] 

##
## now build the GRanges that will become the FeatureDb:
##
## scaffolding: 
##
require(Biostrings)
require(GenomicRanges)
sourceSeq <- with(infMeth.hg19, DNAStringSet(SOURCESEQ))
gcContent <- round(letterFrequency(sourceSeq, letters='GC', as.prob=T), 2)
Infinium.GR.hg19 <- with(infMeth.hg19,
                 GRanges(paste0('chr', CHR), 
                         IRanges(as.numeric(MAPINFO), width=1),
                         strand=as.factor(ifelse(STRAND == 'F','-','+')),
                         addressA_450=Rle(ADDRESSA_450),
                         addressB_450=Rle(ADDRESSB_450),
                         addressA_27=Rle(ADDRESSA_27),
                         addressB_27=Rle(ADDRESSB_27),
                         channel450=Rle(as.factor(CHANNEL450)),
                         channel27=Rle(as.factor(CHANNEL27)),
                         probeType=Rle(as.factor(substr(NAME, 1, 2))),
                         percentGC=as.vector(gcContent),
                         platform=Rle(as.factor(PLATFORM)),
                         sourceSeq=DNAStringSet(SOURCESEQ)
                        )
                 )
names(Infinium.GR.hg19) <- infMeth.hg19$NAME
both = setdiff(levels(mcols(Infinium.GR.hg19)$channel450)[1], c('Grn','Red'))
swap = which(levels(mcols(Infinium.GR.hg19)$channel450) == both)
levels(mcols(Infinium.GR.hg19)$channel450)[ swap ] <- 'Both'
genome(Infinium.GR.hg19) <- 'hg19'

## keep track of 5' and 3' ends of probes for masking purposes
##
mcols(Infinium.GR.hg19)$probeStart <- start(Infinium.GR.hg19)
mcols(Infinium.GR.hg19)$probeEnd <- end(resize(Infinium.GR.hg19, 50, 'end')) ## temp.
mcols(Infinium.GR.hg19)$probeTarget <- start(Infinium.GR.hg19)
mcols(Infinium.GR.hg19)$probeExtension <- NA ## temporary
revMe = which(mcols(Infinium.GR.hg19)$probeStart == mcols(Infinium.GR.hg19)$probeEnd)
#browser()
mcols(Infinium.GR.hg19[revMe])$probeStart=start(resize(Infinium.GR.hg19[revMe],49,'end'))
mcols(Infinium.GR.hg19[revMe])$probeEnd=end(resize(Infinium.GR.hg19[revMe],2,'start'))
extend = which(mcols(Infinium.GR.hg19)$probeEnd == start(Infinium.GR.hg19)+1)
mcols(Infinium.GR.hg19[extend])$probeExtension <- start(Infinium.GR.hg19[extend])+1

## fix stranding and check dinucleotide sequence in the reference genome
##
library(BSgenome.Hsapiens.UCSC.hg19)
resizeToStart <- which(
  getSeq(Hsapiens, resize(Infinium.GR.hg19, 2, fix='start'), as.char=T) == 'CG'
)
resizeToEnd <- which(
  getSeq(Hsapiens, resize(Infinium.GR.hg19, 2, fix='end'), as.char=T) == 'CG'
)
Infinium.GR.hg19[resizeToStart] <- resize(Infinium.GR.hg19[resizeToStart], 2, fix='start')
Infinium.GR.hg19[ resizeToEnd ] <- resize(Infinium.GR.hg19[ resizeToEnd ], 2, fix='end')
restranded = length(resizeToEnd) + length(resizeToStart)
unstranded = length(Infinium.GR.hg19) - restranded # not bad! mostly CpH + SNP probes
print(paste(restranded, 'probes restranded, only', unstranded, 'left to go...'))

## fix CpH probes
##
lastLetter <- function(x) substr(x, nchar(x), nchar(x))
strand(Infinium.GR.hg19)[ which(lastLetter(names(Infinium.GR.hg19)) == 'F') ] <- '-'
strand(Infinium.GR.hg19)[ which(lastLetter(names(Infinium.GR.hg19)) == 'R') ] <- '+'
Infinium.GR.hg19 <- resize(Infinium.GR.hg19, 
  width=ifelse(as.vector(mcols(Infinium.GR.hg19)$probeType)=='ch',1,2), 
  fix=ifelse(as.vector(strand(Infinium.GR.hg19))=='+','start','end')
)

## fix SNP probes (again)
##
start(Infinium.GR.hg19[ noMap.ids ]) = hm450k.rsProbes[ noMap.ids, 'MAPINFO' ]
width(Infinium.GR.hg19[ noMap.ids ]) = 1

## commenting out this line retains stranding information in the GRanges
##
## strand(Infinium.GR.hg19) = '*'

## It turns out that the HM27k chip DOES have SNP probes.  Label these:
##
# data(hm27.controls)
load("../../data/hm27.controls.rda")
# data(hm27.SNP.colors)
load("../../data/hm27.SNP.colors.rda")
hm27.SNP.controls <- hm27.controls[grep('^rs', hm27.controls$Name), ]
for(i in unique(hm27.SNP.controls$Name)) {
  i = gsub('_$','',i)
  if( i %in% names(Infinium.GR.hg19) & !is.na(hm27.SNP.colors[i]) ) {
    addresses = hm27.SNP.controls$Address[ grep(i, hm27.SNP.controls$Name) ]
    mcols(Infinium.GR.hg19[ i ])$addressA_27 = addresses[1]
    mcols(Infinium.GR.hg19[ i ])$addressB_27 = addresses[2]
    mcols(Infinium.GR.hg19[ i ])$channel27 = hm27.SNP.colors[i]
    mcols(Infinium.GR.hg19[ i ])$platform = 'BOTH'
  }
}
#Infinium.GR.hg19[na.omit(match(unique(hm27.SNP.controls$Name), names(Infinium.GR.hg19)))]

## Now order the probes based on their name
##
Infinium.GR.hg19 <- Infinium.GR.hg19[order(names(Infinium.GR.hg19))]

## Create the FDb, and save it.
##
FDb.InfiniumMethylation.hg19 = GenomicRangesToFeatureDb(
  Infinium.GR.hg19, 
  URL='ftp://ftp.illumina.com', 
  tableName='InfiniumMethylation', 
  src='NCBI/GEO and dbSNP', 
  label='Illumina Infinium DNA methylation probes, mapped to hg19',
  genus.and.species='Homo sapiens'
)
saveDb(FDb.InfiniumMethylation.hg19, 
       file='FDb.InfiniumMethylation.hg19.sqlite')

## Now verify that it comes out the same as it went in (well, almost)
## The extra lines of code are, in my view, a bug in GenomicFeatures
## Also, the types of variables (Rle, numeric, etc.) are not retained
## Nonetheless, this is a pretty handy structure for storing the data.
##
FDb.InfiniumMethylation.hg19 <- loadDb('FDb.InfiniumMethylation.hg19.sqlite')
infi <- features(FDb.InfiniumMethylation.hg19)
# names(infi) <- mcols(infi)$name
# met <- metadata(FDb.InfiniumMethylation.hg19) ## need to fetch genome
# genome(infi) <- met[ which(met[,'name'] == 'Genome'), 'value' ]
show(infi)

## verify the stranding, start, end, target
foo <- resize(head(infi), 50, fix='end')
all(end(foo) == as.numeric(foo$probeEnd))
all(start(foo) == as.numeric(foo$probeStart))

## restrict to CpG probes (the SNP probes don't behave so well)
bar <- resize(tail(infi[which(substr(names(infi), 1, 2) == 'cg')]),50,fix='end')
all(end(bar) == as.numeric(bar$probeEnd))
all(start(bar) == as.numeric(bar$probeStart))

## If you wanted to compute on the source sequences, e.g. observed/expected CpGs
##
mcols(infi)$sourceSeq <- DNAStringSet(
  mcols(infi)$sourceSeq
)

## To fit GC-dependent intensity models (e.g. for segmentation or preprocessing)
##
mcols(infi)$percentGC <- as.numeric(
  mcols(infi)$percentGC
)

## For a smaller object to save:
##
mcols(infi)$addressA <- Rle(
  as.numeric(mcols(infi)$addressA)
)
mcols(infi)$addressB <- Rle(
  as.numeric(mcols(infi)$addressB)
)
mcols(infi)$probeType <- Rle(
  as.factor(mcols(infi)$probeType)
)
mcols(infi)$platform <- Rle(
  as.factor(mcols(infi)$platform)
)
mcols(infi)$channel <- Rle(
  as.factor(mcols(infi)$channel)
)
mcols(infi)$design <- Rle(
  as.factor(mcols(infi)$design)
)
names(infi) <- mcols(infi)$name
mcols(infi)$name <- NULL
show(infi)

## by probe type:
split(infi, mcols(infi)$probeType)

## by chromosome:
split(infi, seqnames(infi))

## by color channel:
split(infi, mcols(infi)$channel)

## by platform, with a note about 'design':
Infinium.by.platform <- function() {
  on450k <- which(mcols(infi)$platform %in% c('HM450','BOTH'))
  on27k <-  which(mcols(infi)$platform %in% c('HM27','BOTH'))
  GRangesList( HM450=infi[ on450k ],
               HM27=infi[ on27k ] )
}
by.platform <- Infinium.by.platform()
names(by.platform)

## As of BioC-2.13, lining these up with transcripts or other genomic landmarks
## is pretty much trivial (thanks to Marc Carlson's efforts with OrganismDbi):
##
if(FALSE) {

  library(Homo.sapiens)
  txs <- transcripts(Homo.sapiens, col='SYMBOL')
  txs.by.symbol <- function(x) txs[ grep(paste0('^', x, '$'), txs$SYMBOL) ]
  txs.by.symbol('NAT2')
  ## 
  ## GRanges with 1 range and 1 metadata column:
  ##       seqnames               ranges strand |          SYMBOL
  ##          <Rle>            <IRanges>  <Rle> | <CharacterList>
  ##   [1]     chr8 [18248755, 18258723]      + |            NAT2


  ## WT1 genic probes:
  ##
  summary(suppressWarnings(subsetByOverlaps(infi, 
                                            txs.by.symbol('WT1'), 
                                            ignore.strand=TRUE)))
  ##  Length   Class    Mode 
  ##      52 GRanges      S4 

  ## WT1 genic probes on the same strand as WT1:
  summary(suppressWarnings(subsetByOverlaps(infi, 
                                            txs.by.symbol('WT1'), 
                                            ignore.strand=FALSE)))
  ##  Length   Class    Mode 
  ##      26 GRanges      S4 

  ## TP53 promoter probes:
  ## 
  summary(suppressWarnings(subsetByOverlaps(infi, ignore.strand=TRUE,
                                            flank(txs.by.symbol('TP53'), 
                                                  2000))))
  ##  Length   Class    Mode 
  ##      21 GRanges      S4 


  ## (shortened) miR-10A promoter probes:
  ##
  summary(suppressWarnings(subsetByOverlaps(infi, ignore.strand=TRUE, 
                                            flank(txs.by.symbol('MIR10A'), 
                                                  50))))
  ##  Length   Class    Mode 
  ##       1 GRanges      S4 

  ## All things considered, the sensible default is to return an unstranded GR

}

