#' Report loci containing secondary SNPs in sequence tags 
#'
#' SNP datasets generated by DArT include fragments with more than one SNP (that is, with secondaries) and record them separately with the same CloneID (=AlleleID).
#' These multiple SNP loci within a fragment are likely to be linked, and so you may wish to remove secondaries.
#' 
#' The script reports statistics associated with secondaries, and the consequences of filtering them out, and provides
#' three plots. The first is a Box and Whisker plot adjusted to account for skewness, the second is a bargraph of
#' the frequency of secondaries per sequence tag, and the third is Poisson expectation for those frequencies
#' including an estimate of the zero class (no. of sequence tags with no SNP scored).
#' 
#' Heterozygosity in gl.report.heterozygosity is in a sense relative, because it is calculated
#' against a background of only those loci that are polymorphic somewhere in the dataset.
#' To allow intercomparability across studies and species, any measure of heterozygosity
#' needs to accommodate loci that are invariant. However, the number of invariant loci
#' are unknown given the SNPs are detected as single point mutational variants and invariant seqeunces are discarded, and because of
#' the particular additional filtering pre-analysis. Modelling the counts
#' of SNPs per sequence tag as a Poisson distribution in this script allows estimate of the zero class,
#' that is, the number of invariant loci. This is reported, and the veracity of the 
#' estimate can be assessed by the correspondence of the observed frequencies against
#' those under Poisson expectation in the associated graphs. The number of invariant loci can then be optionally
#' provided to gl.report.heterozygosity via the parameter n.invariants.
#'
#' @param x -- name of the genlight object containing the SNP data [required]
#' @param boxplot -- if 'standard', plots a standard box and whisker plot; if 'adjusted',
#' plots a boxplot adjusted for skewed distributions [default 'adjusted']
#' @param range -- specifies the range for delimiting outliers [default = 1.5 interquartile ranges]
#' @param verbose -- verbosity: 0, silent or fatal errors; 1, begin and end; 2, progress log ; 3, progress and results summary; 5, full report [default 2 or as specified using gl.set.verbosity]
#' @return returns a genlight object of loci with multiple SNP calls
#' @importFrom adegenet glPlot
#' @importFrom graphics barplot
#' @importFrom robustbase adjbox
#' @importFrom stats dpois
#' @export
#' @author Arthur Georges (Post to \url{https://groups.google.com/d/forum/dartr})
#' @examples
#' out <- gl.report.secondaries(bandicoot.gl)

gl.report.secondaries <- function(x, 
                                  boxplot="adjusted",
                                  range=1.5,
                                  verbose = 2) {

# TRAP COMMAND, SET VERSION
  
  funname <- match.call()[[1]]
  build <- "Jacob"
  
# SET VERBOSITY
  
  if (is.null(verbose)){ 
    if(!is.null(x@other$verbose)){ 
      verbose <- x@other$verbose
    } else { 
      verbose <- 2
    }
  } 
  
  if (verbose < 0 | verbose > 5){
    cat(paste("  Warning: Parameter 'verbose' must be an integer between 0 [silent] and 5 [full report], set to 2\n"))
    verbose <- 2
  }
  
# FLAG SCRIPT START
  
  if (verbose >= 1){
    if(verbose==5){
      cat("Starting",funname,"[ Build =",build,"]\n")
    } else {
      cat("Starting",funname,"\n")
    }
  }
  
# STANDARD ERROR CHECKING
  
  if(class(x)!="genlight") {
    stop("Fatal Error: genlight object required!")
  }
  
  if (all(x@ploidy == 1)){
    stop("  Detected Presence/Absence (SilicoDArT) data. This filter is not available for Presence/Absence data.\n")
  } else if (all(x@ploidy == 2)){
    cat("  Processing a SNP dataset\n")
  } else {
    stop("Fatal Error: Ploidy must be universally 1 (fragment P/A data) or 2 (SNP data)!")
  }
  
# SCRIPT SPECIFIC ERROR CHECKING

  if (!(boxplot=="standard" | boxplot=="adjusted")) {
    cat("Warning: Box-whisker plots must either standard or adjusted for skewness, set to boxplot='adjusted'\n")
    boxplot <- 'adjusted'   
  }
    
# DO THE JOB

  # Extract the clone ID number
  a <- strsplit(as.character(x@other$loc.metrics$AlleleID),"\\|")
  b <- unlist(a)[ c(TRUE,FALSE,FALSE) ]
  if (verbose >= 2) {
    cat("Counting ....\n")
  }
  x.secondaries <- x[,duplicated(b)]
     # Work around a bug in adegenet if genlight object is created by subsetting
     x.secondaries@other$loc.metrics <- x.secondaries@other$loc.metrics[1:nLoc(x),]

  nloc.with.secondaries <- table(duplicated(b))[2]
  if (!is.na(nloc.with.secondaries)){
    # Prepare for plotting
    # Save the prior settings for mfrow, oma, mai and pty, and reassign
    op <- par(mfrow = c(3, 1), oma=c(1,1,1,1), mai=c(0.5,0.5,0.5,0.5),pty="m")
    # Set margins for first plot
    par(mai=c(0.6,0.5,0.5,0.5))
    # Plot Box-Whisker plot
    robustbase::adjbox(c(0,as.numeric(table(b))),
           horizontal = TRUE,
           col='red',
           range=range,
           main = "Box and Whisker Plot")
    # Set margins for second plot
    par(mai=c(0.5,0.5,0.2,0.5))  
    # Plot Histogram
    freqs <- c(0,table(as.numeric(table(b))))
    f <- as.table(freqs)
    names(f)<- seq(1:(length(f)))-1
    barplot(f,col="red", space=0.5, main="Observed Frequency of SNPs per Sequence Tag")
    
    # Plot Histogram with estimate of the zero class
    if (verbose >= 2){
      cat("Estimating parameters (lambda) of the Poisson expectation\n")
    }
      # Calculate the mean for the truncated distribution
        freqs <- as.numeric(freqs)
        tmp<- NA
        for (i in 1:length(freqs)){
          tmp[i] <- freqs[i]*(i-1)
        }
        tmean <- sum(tmp)/sum(freqs)
      # Set a random seed, close to 1
        seed <- tmean
      # Set convergence criterion
        delta <- 0.00001
      # Use the mean of the truncated distribution to compute lambda for the untruncated distribution
        k <- seed
        for (i in 1:100){
          if (verbose >= 2){print(k)}
          k.new <- tmean*(1-exp(-k))
          if (abs(k.new - k) <= delta){
            if (verbose >= 2){cat("Converged on Lambda of",k.new,"\n")}
            fail <- FALSE
            break
          }
          if (i == 100){
            if(verbose >= 2){cat("Failed to converge: No reliable estimate of invariant loci\n")}
            fail <- TRUE
            break
          }
          k <- k.new
        }
        
      # Size of the truncated distribution
        if (!fail) {
          n <- sum(freqs)  # Size of the truncated set 
          tp <- 1 - dpois( x=0, lambda=k ) # Fraction that is the truncated set
          rn <- round(n/tp,0) # Estimate of the whole set
          cat("Estimated size of the zero class",round(dpois(x=0,lambda=k)*rn,0),"\n")
          # Table for the reconstructed set  
            reconstructed <- dpois( x=0:(length(freqs)-1), lambda=k )*rn
            reconstructed <- as.table(reconstructed)
            names(reconstructed)<- seq(1:(length(reconstructed)))-1
          # Set margins for third plot
            par(mai=c(0.5,0.5,0.2,0.5))
            title <- paste0("Poisson Expectation (zero class ",round(dpois(x=0,lambda=k)*rn,0)," invariant loci)")
            barplot(reconstructed,col="red", space=0.5, main=title)
        }
  } else {
      cat("  Warning: No loci with secondaries, no plot produced\n")
  }
  
# Identify secondaries in the genlight object
  cat("  Total number of SNP loci scored:",nLoc(x),"\n")
  if (is.na(table(duplicated(b))[2])) {
    cat("    Number of secondaries: 0 \n")
  } else {
      cat("   Number of sequence tags in total:",table(duplicated(b))[1],"\n")
      if (fail){
        cat("    Number of invariant sequence tags cannot be estimated\n")
      } else {
        cat("   Estimated number of invariant sequence tags:", round(dpois(x=0,lambda=k)*rn,0),"\n")
      }  
      cat("    Number of sequence tags with secondaries:",sum(table(as.numeric(table(b))))-table(as.numeric(table(b)))[1],"\n")
      cat("    Number of secondary SNP loci that would be removed on filtering:",table(duplicated(b))[2],"\n")
      cat("    Number of SNP loci that would be retained on filtering:",table(duplicated(b))[1],"\n")
      if(verbose >= 3){cat(" Tabular 1 to K secondaries (refer plot)\n",table(as.numeric(table(b))),"\n")}
  }  

  # Reset the par options    
  if (!is.na(nloc.with.secondaries)){par(op)}
  
# FLAG SCRIPT END
  
  if (verbose >= 1) {
    cat("Completed:",funname,"\n")
  }
  
  if(verbose >= 2){cat("\nReturning a genlight object containing only those loci with secondaries (multiple entries per locus)\n\n")}
  return(x.secondaries)

}  
