###

library(Biostrings) # for mergeIUPACLetters()

### Performs some basic sanity checking.
checkRawSNPs <- function(rawsnps, seqname)
{
    if (any(is.na(rawsnps$RefSNP_id)))
        stop("some RefSNP ids are NA on ", seqname)
    chr_pos_is_NA <- is.na(rawsnps$chr_pos)
    ## Should never happen because we've already filtered out SNPs with an
    ## unspecified location (see filter3_ds_flat.sh).
    if (any(chr_pos_is_NA)) {
        msg <- c("SNPs with RefSNP ids ",
                 paste(rawsnps$RefSNP_id[chr_pos_is_NA], collapse=", "),
                 " on ", seqname, " still have an unspecified location ",
                 "despite our previous filtering -- dropping them now")
        warning(paste(msg, collapse=""))
        rawsnps <- rawsnps[!chr_pos_is_NA , ]
    }
    rawsnps
}

### Produce a data frame with 3 columns:
###   1. RefSNP_id ("rs" prefix removed)
###   2. alleles_as_ambig: alleles as an IUPAC letter (complemented if strand is -)
###   3. loc: the current chr_pos col
cookRawSNPs <- function(rawsnps, seqname)
{
    ids <- rawsnps$RefSNP_id
    if (!all(substr(ids, 1, 2) == "rs"))
        stop("some RefSNP ids do not start with \"rs\" on ", seqname)
    ids <- substr(ids, 3, nchar(ids))
    alleles <- gsub("/", "", rawsnps$alleles, fixed=TRUE)
    in_minus_strand <- which(rawsnps$strand == "-")
    alleles[in_minus_strand] <- chartr("ACGT", "TGCA", alleles[in_minus_strand])
    alleles_as_ambig <- mergeIUPACLetters(alleles)
    ans <- data.frame(RefSNP_id=ids,
                      alleles_as_ambig=alleles_as_ambig,
                      loc=rawsnps$chr_pos,
                      stringsAsFactors=FALSE)
    ans <- ans[order(ans$loc), ]
    row.names(ans) <- NULL
    ans
}

### Return indices of SNPs that are hidden by a strictly less specific
### SNP occurring at the same location. Also return indices of
### SNPs occurring at the same location with incompatible reported
### alleles.
hiddenByLessSpecificSNP <- function(snplocs)
{
    loc_dups <- duplicated(snplocs$loc)
    dups <- duplicated(snplocs[ , c("alleles_as_ambig", "loc")])
    locs0 <- unique(snplocs$loc[which(loc_dups & !dups)])
    ans <- integer(0)
    for (loc in locs0) {
        have_this_loc <- which(snplocs$loc == loc)
        ambigs <- snplocs$alleles_as_ambig[have_this_loc]
        less_specific <- mergeIUPACLetters(paste(ambigs, collapse=""))
        ans <- c(ans, which(snplocs$loc == loc & snplocs$alleles_as_ambig != less_specific))
    }
    ans
}

### 'shortseqnames' must be a single string (e.g. "20 21 22")
loadAndserializeSNPs <- function(path, shortseqnames, chr_prefix="ch")
{
    seqnames <- paste(chr_prefix, strsplit(shortseqnames, " ", fixed=TRUE)[[1]], sep="")
    SNPcount <- integer(0)
    COLNAMES <- c("RefSNP_id", "alleles", "avg_het", "se_het", "chr", "chr_pos", "strand")
    for (seqname in seqnames) {
        cat("Loading raw SNPs for ", seqname, " ... ", sep="")
        filepath <- file.path(path, paste0(seqname, "_rawsnps.txt"))
        rawsnps <- read.table(filepath, quote="", col.names=COLNAMES,
                              na.strings="?", stringsAsFactors=FALSE)
        cat("OK\n")

        objname <- paste0(seqname, "_snplocs")
        cat("Making ", objname, " ... ", sep="")
        rawsnps <- checkRawSNPs(rawsnps, seqname)
        snplocs <- cookRawSNPs(rawsnps, seqname)
        ## No need to drop inconsistent or redundant SNPs
        #to_drop <- hiddenByLessSpecificSNP(snplocs)
        #if (length(to_drop) != 0) {
        #    cat("Dropping ", length(to_drop), " inconsistent or redundant SNPs ...\n", sep="")
        #    snplocs <- snplocs[-to_drop, ]
        #    row.names(snplocs) <- NULL
        #}
        cat("OK\n")

        SNPcount <- c(SNPcount, nrow(snplocs))

        cat("Saving ", objname, " (data frame with ", nrow(snplocs), " SNPs) ... ", sep="")
        filepath <- paste0(objname, ".rda")
        assign(objname, snplocs, envir=.GlobalEnv)
        save(list=objname, file=filepath, envir=.GlobalEnv)
        cat("OK\n")
    }
    names(SNPcount) <- seqnames
    cat("Saving the SNPcount table ... ")
    assign("SNPcount", SNPcount, envir=.GlobalEnv)
    save(list="SNPcount", file="SNPcount.rda", envir=.GlobalEnv)
    cat("OK\n")
    cat("DONE.\n")
}

