### R-Script supplied with Pirana to perform Type I Error assessment
###
### Required: - Single reference VBE with reference group simulation in Pirana working directory e.g., {vbe_name}_reference_group.csv
###           - One or more test VBE with test group simulations in in Pirana working directory e.g., {vbe_name}_test_group.csv
###           - Optional PE data file (Observations data file associated with Simcyp workspace). Required if selecting 'Compute sigma from obs'.
###
### Description: This R script automates an empirical Type I error assessment for Simcyp VBE simulations. It reads 'reference'
### and 'test' group concentration-time predictions, adds log-normal noise if specified, then computes individual Cmax or AUC
### for each subject. For each candidate sample size per arm, it partitions the data into many mini-trials, applies a log-scale
### two-one-sided test (TOST) to the Cmax or AUC ratios, and records the proportion of trials rejecting bioequivalence (i.e., empirical alpha)
### Finally, it smooths the alpha-versus-sample-size curve and uses ggplot2 to produce either individual or combined plots with the 5% nominal
### line and model comparisons.
###

### <arguments>
###       <gmr_type label="GMR" type="choice">CMax;AUC??CMax</gmr_type>
###       <n_subj_val label="Subjects per arm" type="text">500</n_subj_val>
###       <resample_with_replacement label="Resample with replacement" type="bool">0</resample_with_replacement>
###       <custom_sigma_val label="Sigma" type="text">.2991788</custom_sigma_val>
###       <use_sigma_obs label="Compute sigma from obs" type="bool">0</use_sigma_obs>
###       <obs_scale_val label="Obs scale" type="text">1</obs_scale_val>
###       <time_filter_val label="Filter time" type="text">8902</time_filter_val>
###       <use_combined label="Combine plots" type="bool">1</use_combined>
### </arguments>

### Arguments supplied from Pirana:
arg <- list (
  gmr_type = "CMax",
  n_subj_val = "500",
  resample_with_replacement = "0",
  custom_sigma_val = ".2991788",
  use_sigma_obs = "0",
  obs_scale_val = "1",
  time_filter_val = "8902",
  use_combined = "1"
)


# Setup ----
## Import .vbe and associated files ----
models <- list (
  "vbe_1" = list (
    wksz_file       = "D:/Users/james.craig/VBE/PP3/pirana_temp/vbe_1_PP3M 350mg.wksz",
    pe_file         = "D:/Users/james.craig/VBE/PP3/pirana_temp/vbe_1_magnusson_fig6_panel2_350mg.xml",
    description     = "PSD=4.6357",
    working_dir     = "D:/Users/james.craig/VBE/PP3",
    reference_file  = "D:/Users/james.craig/VBE/PP3/vbe_1_reference_group.csv",
    test_file       = "",
    type            = "reference",
    cqa_name        = "idLogNormalRadiusMean",
    cqa_value       = 4.6357

  ),
  "vbe_5" = list (
    wksz_file       = "D:/Users/james.craig/VBE/PP3/pirana_temp/vbe_5_PP3M 350mg.wksz",
    pe_file         = "D:/Users/james.craig/VBE/PP3/pirana_temp/vbe_5_magnusson_fig6_panel2_350mg.xml",
    description     = "PSD=2.7762",
    working_dir     = "D:/Users/james.craig/VBE/PP3",
    reference_file  = "",
    test_file       = "D:/Users/james.craig/VBE/PP3/vbe_5_test_group.csv",
    type            = "test",
    cqa_name        = "idLogNormalRadiusMean",
    cqa_value       = 2.7762

  ),
  "vbe_14" = list (
    wksz_file       = "D:/Users/james.craig/VBE/PP3/pirana_temp/vbe_14_PP3M 350mg.wksz",
    pe_file         = "D:/Users/james.craig/VBE/PP3/pirana_temp/vbe_14_magnusson_fig6_panel2_350mg.xml",
    description     = "PSD=6.823",
    working_dir     = "D:/Users/james.craig/VBE/PP3",
    reference_file  = "",
    test_file       = "D:/Users/james.craig/VBE/PP3/vbe_14_test_group.csv",
    type            = "test",
    cqa_name        = "idLogNormalRadiusMean",
    cqa_value       = 6.8233

  )
)
run_from <- list(software = "pirana", version = "25.7.1")
open_res <- 1

setwd('D:/Users/james.craig/VBE/PP3')

start_time <- Sys.time()
message("Start: ", start_time)

## Process script arguments ----
n_subj_val <- as.numeric(arg$n_subj_val)
use_sigma_obs <- as.logical(as.numeric(arg$use_sigma_obs))
obs_scale_val <- as.numeric(arg$obs_scale_val)
custom_sigma_val  <- as.numeric(arg$custom_sigma_val)
time_filter_val <- as.numeric(arg$time_filter_val)
use_combined <-  as.logical(as.numeric(arg$use_combined))
resample_with_replacement <- as.logical(as.numeric(arg$resample_with_replacement))
gmr_type <- arg$gmr_type

## Validation ----
if (length(models) == 0) {
  stop(
    "Results missing for selected `.vbe`, ensure either {vbe}_reference_group.csv or {vbe}_test_group.csv exist in Pirana directory."
  )
}

reference_model <- models[sapply(models, function(x) x$type == "reference")]
test_models <- models[sapply(models, function(x) x$type == "test")]

# Validate reference models
if (length(reference_model) == 0) {
  stop("Validation error: You must select exactly one 'reference' VBE model, but none were selected.")
} else if (length(reference_model) > 1) {
  stop(sprintf("Validation error: You must select exactly one 'reference' VBE model, but %d were selected.", length(reference_model)))
}

# Validate test models
if (length(test_models) == 0) {
  stop("Validation error: At least one 'test' VBE model must be selected, but none were selected.")
}

## Prepare directories ----
if (!file.exists("pirana_reports")) {
  dir.create ("pirana_reports")
}

if (!file.exists("pirana_temp")) {
  dir.create ("pirana_temp")
}

## Import libraries ----
if(Sys.getenv("R_LIB") != "") { .libPaths(c(Sys.getenv("R_LIB"), .libPaths())) }
# Check required packages
req_pkgs <- c("data.table", "Simcyp", "tidyr", "ggplot2", "ggrepel", "purrr")
missing_pkgs <- req_pkgs[!sapply(req_pkgs, requireNamespace, quietly = TRUE)]
if (length(missing_pkgs)) {
  stop("Package(s) missing: ", paste(missing_pkgs, collapse = ", "),
       "\nInstall with: install.packages(c(",
       paste(shQuote(missing_pkgs), collapse = ", "), "))",
       call. = FALSE)
}

library(ggplot2)

## Create Functions ----
get.AUC <- function(times, conc) {
  dx <- diff(times)
  (colSums((conc[-1, ] + conc[-nrow(conc), ]) * dx)) / 2
}

get.Cmax <- function(conc) {
  return(apply(conc, MARGIN=2, FUN=max))
}

myTOST <- function(logX.ref, logX.test, N) {
  mu.ref  <- mean(logX.ref)
  mu.test <- mean(logX.test)

  var.ref  <- var(logX.ref)
  var.test <- var(logX.test)

  logmeans.diff <- mu.test - mu.ref
  logvars.mean  <- (var.ref + var.test) / 2

  delta <- qt(0.95, df = 2*N - 2) * sqrt(logvars.mean * 2 / N)
  CL_lo <- logmeans.diff - delta
  CL_up <- logmeans.diff + delta

  (exp(CL_lo) > 0.8) && (exp(CL_up) < 1.25)
}

# Analysis ----
# Import reference file and prepare data
reference_data <- data.table::fread(reference_model[[1]]$reference_file)
pe_file <- reference_model[[1]]$pe_file
# Determine sigma to use for log-normal noise
if (use_sigma_obs) {
  message("Computing sigma from observed data (pe_file)...")

  stopifnot(file.exists(pe_file))

  pe_data <- Simcyp::ReadPEData(path = pe_file, scale = obs_scale_val)$Observations
  sim_times <- sort(unique(reference_data$Time))
  subject_ids <- intersect(unique(reference_data$ID), unique(pe_data$SubjectID))

  if (length(subject_ids) == 0) {
    stop("No matching SubjectIDs found between simulated and observed data.")
  }

  all_residuals <- c()

  for (sid in subject_ids) {
    pred <- reference_data[ID == sid, Conc]
    obs <- pe_data[pe_data$SubjectID == sid, ]

    if (nrow(obs) == 0 || length(pred) == 0) next

    pred_interp <- approx(sim_times, pred, obs$Time, rule = 2)$y
    dv_obs <- obs$DV

    if (length(pred_interp) != length(dv_obs)) next

    res <- log(dv_obs) - log(pred_interp)
    all_residuals <- c(all_residuals, res)
  }

  if (length(all_residuals) < 5) {
    warning("Fewer than 5 usable residuals found.")
  }

  sd_residuals <- sd(all_residuals, na.rm = TRUE)
  sigma <- sqrt(log(1 + sd_residuals^2))
  message("Estimated sigma from ",
          length(subject_ids),
          " subject(s): ",
          round(sigma, 4))
} else {
  sigma <- custom_sigma_val
  if (is.na(sigma)) {
    message("No sigma specified. No noise will be added to simulations.")
  } else {
    message("Using user-specified sigma: ", sigma)
  }
}

if (!is.na(sigma)) {
  reference_data[, Conc := rlnorm(.N, meanlog = log(Conc), sdlog = sigma)]
}

reference_data[, ID_prefixed := paste0("RefSubj", ID)]

wide_ref <- data.table::dcast(
  data     = reference_data,
  formula  = Time ~ ID_prefixed,
  value.var= "Conc"
)

# Import test files(s)
model_names <- names(test_models)

if (use_combined) {
  vbe_cache <- list()
}

for (i in seq(model_names)) {
  model     <- model_names[i]
  message("Processing ", model)

  test_data <- data.table::fread(models[[model]]$test_file)

  if (any(unique(reference_data$Time)  != unique(test_data$Time))) {
    warning("Time mismatch between reference and test groups, skipping ", model)
    next
  }

  # # Add noise
  if (!is.na(sigma)) {
  test_data[, Conc := rlnorm(.N, meanlog = log(Conc), sdlog = sigma)]
  }

  test_data[, ID_prefixed := paste0("TestSubj", ID)]

  wide_test <- data.table::dcast(
    data     = test_data,
    formula  = Time ~ ID_prefixed,
    value.var= "Conc"
  )

  # Ensure N subject req
  if (ncol(wide_test) > ncol(wide_ref)) {
    wide_test <- wide_test[, 1:ncol(wide_ref)]
  }

  wide_ref_keep <- wide_ref[, 1:ncol(wide_test)]
  vbe_data <- cbind(wide_ref_keep, wide_test[, -1, with = FALSE])
  vbe_data <- as.data.frame(vbe_data)

  ## Keep only the times in the last dosing period
  last_dosing_time <- time_filter_val
  keep <- which(vbe_data[,1] > last_dosing_time)
  vbe_data <- vbe_data[keep,]

  ## Total number of subjects
  N.subj <- ncol(vbe_data) - 1
  ## Prepare times
  times <- vbe_data$Time
  plot.times <- times / (24 * 7) # in weeks
  N.times <- length(plot.times)
  ##
  ## Prepare computing Cmax and AUC for each subject
  Cmax <- rep(0, N.subj)
  AUC  <- rep(0, N.subj)
  istart   <- 1
  iend     <- N.times
  itime    <- istart:iend
  ##
  ## find Cmax for each subject
  if (gmr_type == "CMax") {
    Cmax <- get.Cmax(vbe_data[itime, 1 + (1:N.subj)])
    logCmax <- log(Cmax)
  } else {
    AUC  <- get.AUC(plot.times[itime], vbe_data[, 1 + (1:N.subj)])
    logAUC  <- log(AUC)
  }

  power <- numeric(n_subj_val)

  for (k in 2:n_subj_val) {
    N.mtc.v <- floor(N.subj / (2 * k))
    BE.yes <- logical(N.mtc.v)

    for (i in seq_len(N.mtc.v)) {
      ref_idx <- sample.int(N.subj / 2, k, replace = resample_with_replacement)
      test_idx <- sample((N.subj / 2 + 1):N.subj, k, replace = resample_with_replacement)

      if (gmr_type == "CMax") {
        logCmax_ref <- logCmax[ref_idx]
        logCmax_test <- logCmax[test_idx]
        BE.yes[i] <-  myTOST(logCmax_ref, logCmax_test, k)
      } else {
        logAUC_ref <- logAUC[ref_idx]
        logAUC_test <- logAUC[test_idx]
        BE.yes[i] <-  myTOST(logAUC_ref, logAUC_test, k)
      }
    }

    power[k] <- mean(BE.yes)
  }


  ## Smooth version of power curve
  smoothed <- supsmu(2:n_subj_val, power[-1])
  # Smallest k where alpha exceed nominal Type I error = 0.05
  threshold_crossing_k <- smoothed$x[which(smoothed$y > 0.05)[1]]
  message("Smallest number of subjects where smoothed alpha > 0.05: ", threshold_crossing_k)

  ## Residual variance
  residual.var <- var(power[2:n_subj_val] - smoothed$y)

  ## Create pirana vbe cache
  vbe_cache_model <- list(power = power,
                    n_subj_val = n_subj_val,
                    smoothed = smoothed,
                    residual.var = residual.var,
                    threshold_crossing_k = threshold_crossing_k
                    )
  saveRDS(vbe_cache_model,
          file = paste0(
            "pirana_temp/",
            names(reference_model),
            "_",
            model,
            "_power.Rds"
          ))

# Create plot ----
  if (!use_combined) {
    plot.df <- data.frame(
      x        = 2:n_subj_val,
      power    = power[-1],
      smoothed = smoothed$y
    )

    p <- ggplot(plot.df, aes(x = x)) +
      geom_line(aes(y = power), color = "grey", linewidth = 0.5) +
      geom_line(aes(y = smoothed), color = "red", linewidth = 1.2) +
      geom_hline(yintercept = 0.05, linetype = "dashed", color = "black") +
      scale_x_continuous(limits = c(1, n_subj_val)) +
      scale_y_continuous(limits = c(0, 1)) +
      labs(
        title = paste0(gmr_type, " Type I Error vs. Sample Size per Arm"),
        subtitle = paste0(models[[model]]$cqa_name, " = ", models[[model]]$cqa_value),
        x = "Number of subjects per arm",
        y = "Probability of declaring bioequivalence"
      ) +
      theme_minimal()

    fname <-
      paste0("pirana_reports/",
             names(reference_model),
             "_", gmr_type,  "_type_I_error_",
             n_subj_val, "_arm_",
             model,
             ".pdf")

    pdf(fname, width=8, height=6)
    print(p)
    dev.off()
    # open created file
    cat (paste("OUTPUT: ", fname, sep=""))
    if (file.exists(fname) && open_res) {
      if (Sys.info()['sysname'] == 'Windows') { shell.exec(paste(getwd(),"/",fname,sep="")) }  # windows
      else if (Sys.info()['sysname'] == 'Darwin') { system(paste ("open ",fname, sep="")) } # mac
      else { system(paste("xdg-open ", fname, sep=""), ignore.stdout=TRUE, ignore.stderr=TRUE, wait=FALSE) } # linux
    }
  } else {
    vbe_cache[[model]] <- vbe_cache_model
  }
} # end for fname

if (use_combined) {
  # Create combined plot
  fname <-
    paste0("pirana_reports/",
           names(reference_model),
           "_", gmr_type, "_type_I_error_",
           n_subj_val, "_arm_",
           paste0(names(test_models),collapse = "_"),
           ".pdf")


  combined_all <- purrr::map2_df(
    vbe_cache,
    test_models,
    function(obj, mod) {
      # Build a per-test data frame
      data.frame(
        x        = 2:obj$n_subj_val,
        power    = obj$power[-1],
        smoothed = obj$smoothed$y,
        model_desc = paste0(mod$cqa_name, " = ", mod$cqa_value)
      )
    }
  )

  combined_long <- combined_all |>
    tidyr::pivot_longer(
      cols = c(power, smoothed),
      names_to = "curve_type",
      values_to = "value"
    )

  type1_error_deviation <- combined_all |>
    dplyr::group_by(model_desc) |>
    dplyr::summarize(dev = mean(abs(smoothed - 0.05)))
  best_model <- type1_error_deviation$model_desc[which.min(type1_error_deviation$dev)]

  threshold_df <- purrr::map2_df(
    vbe_cache,
    test_models,
    function(obj, mod) {
      data.frame(
        model_desc = paste0(mod$cqa_name, " = ", mod$cqa_value)
        k = obj$threshold_crossing_k
      )
    }
  )

  message("Creating combined power plot")
  pdf(fname, width = 8, height = 6)

  p <- ggplot(combined_long, aes(x = x, y = value)) +
    geom_line(
      aes(color = model_desc, size = curve_type),
      show.legend = TRUE
    ) +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "black") +
    scale_size_manual(
      values = c(power = 0.5, smoothed = 1.5),
      guide = "none"  # Turn off legend for size
    ) +
    scale_x_continuous(limits = c(1, n_subj_val)) +
    scale_y_continuous(limits = c(0, 1)) +
    labs(
      title = paste0(gmr_type, " Type I Error vs. Sample Size per Arm"),
      x = "Number of subjects per arm",
      y = "Probability of declaring bioequivalence",
      color = "Model"
    ) +
    theme_minimal() +
    theme(legend.position = "right")

  print(p)

  dev.off()

  cat (paste("OUTPUT: ", fname, sep=""))
  if (file.exists(fname) &&
      open_res) {
    if (Sys.info()['sysname'] == 'Windows') {
      shell.exec(paste(getwd(), "/", fname, sep = ""))
    }  # windows
    else if (Sys.info()['sysname'] == 'Darwin') {
      system(paste ("open ", fname, sep = ""))
    } # mac
    else {
      system(
        paste("xdg-open ", fname, sep = ""),
        ignore.stdout = TRUE,
        ignore.stderr = TRUE,
        wait = FALSE
      )
    } # linux
  }
}

end_time <- Sys.time()
message("Complete: ", end_time)
message(capture.output(end_time - start_time))


