#!/usr/bin/env Rscript
# build_all_data_with_flags.R
#
# Build a single auditable Excel file that joins:
#   - _all_data.xlsx               (851 cores, all per-pathologist read cells)
#   - _first_phase/report_vs_ai.xlsx (851 cores, AI / report / reference dx)
#   - _temp_subjective.RDS         (810 cores actually carried into Phase II)
# and adds explicit per-core flags / status text so the 851 → 810 → 138
# chain is fully transparent.
#
# Output: revision1/extracted_data/all_data_with_flags.xlsx

suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
  library(readxl)
  library(writexl)
  library(here)
})

ROOT    <- here::here()
OUT_DIR <- file.path(ROOT, "revision1", "extracted_data")
dir.create(OUT_DIR, recursive = TRUE, showWarnings = FALSE)

# ---------------------------------------------------------------------------
# 1. Load the three sources
# ---------------------------------------------------------------------------
ad <- readxl::read_excel(file.path(ROOT, "_all_data.xlsx"))
ra <- readxl::read_excel(file.path(ROOT, "_first_phase", "report_vs_ai.xlsx"),
                         col_types = "text")
rds <- readRDS(file.path(ROOT, "_temp_subjective.RDS")) |> as.data.frame()
intvl_cols <- vapply(rds, inherits, logical(1), what = "Interval")
rds <- rds[, !intvl_cols]

# Canonical Phase I exclusion list maintained by the senior pathologist at
# the time of the original analysis. Categories: excludeIHC (accidentally
# uploaded IHC stain), excludeDuplicate (rescan of the same physical block),
# excludeWebNotWorking (Paige website failed), exclude (non-prostate tissue),
# include (kept). This is the source of truth for the Phase I cohort
# cleaning that produced the originally reported n = 836.
ex_list <- readxl::read_excel(
  file.path(ROOT, "_archive", "paige_results", "paige-prostate-exclude-list.xlsx")
)
ex_excluded <- ex_list |> filter(include != "include")

cat(sprintf("_all_data.xlsx              : %d rows\n", nrow(ad)))
cat(sprintf("report_vs_ai.xlsx           : %d rows\n", nrow(ra)))
cat(sprintf("_temp_subjective.RDS        : %d rows\n", nrow(rds)))
cat(sprintf("paige-prostate-exclude-list : %d rows (%d marked exclude*)\n",
            nrow(ex_list), nrow(ex_excluded)))

# ---------------------------------------------------------------------------
# 2. Per-cell read counts (8 reader x condition diagnosis cells)
# ---------------------------------------------------------------------------
diag_cols <- c("P1_noAI_Diagnosis", "P2_noAI_Diagnosis",
               "P3_noAI_Diagnosis", "P4_noAI_Diagnosis",
               "P1_withAI_Diagnosis", "P2_withAI_Diagnosis",
               "P3_withAI_Diagnosis", "P4_withAI_Diagnosis")
for (col in diag_cols) {
  ad[[paste0(col, "_filled")]] <- !is.na(ad[[col]])
}
ad$n_reader_cells_filled <- rowSums(!is.na(ad[, diag_cols]))
ad$read_by_all_4_pathologists <- ad$n_reader_cells_filled == 8L

# ---------------------------------------------------------------------------
# 3. Per-cell Gleason fill counts (parseable primary + secondary)
# ---------------------------------------------------------------------------
gleason_cols <- c("P1_noAI_Primary", "P2_noAI_Primary",
                  "P3_noAI_Primary", "P4_noAI_Primary",
                  "P1_withAI_Primary", "P2_withAI_Primary",
                  "P3_withAI_Primary", "P4_withAI_Primary")
parseable <- function(x) !is.na(x) & x != "" & x != "0" & grepl("^[1-5]$", as.character(x))
ad$n_reader_gleason_filled <- rowSums(sapply(gleason_cols, function(c) parseable(ad[[c]])))
ad$gleason_from_every_reader <- ad$n_reader_gleason_filled == 8L

# ---------------------------------------------------------------------------
# 4. Cohort membership flags
# ---------------------------------------------------------------------------
ad$in_phase1_xlsx        <- ad$Slide_Label %in% ra$slide_name
ad$in_phase2_rds         <- ad$Slide_Label %in% rds$Slide_Label

# ---------------------------------------------------------------------------
# 5. Joined Phase I metadata (research dx, patterns, comments)
# ---------------------------------------------------------------------------
ra_join <- ra |>
  transmute(
    Slide_Label = slide_name,
    case_no              = caseNo,
    tissue_control,
    Dx_Paige             = Dx_Paige,
    Dx_Report            = Dx_Report,
    Dx_Research          = Dx_Research,
    paige_pattern1, paige_pattern2,
    report_pattern1, report_pattern2,
    research_pattern1, research_pattern2,
    comment_SB
  )

# Drop conflicting columns from `ad` before the join so the Phase I metadata
# wins (the _all_data.xlsx copies are stale).
ad <- ad |> select(-any_of(c("Dx_Paige", "Dx_Report", "Dx_Research")))

# Concordance fill for research_pattern (mirrors revision1.qmd `features` chunk)
is_empty_pattern <- function(x) is.na(x) | !grepl("^[1-5]$", x)
ra_join <- ra_join |>
  mutate(
    research_pattern1_filled = ifelse(
      Dx_Research == "Present" &
        is_empty_pattern(research_pattern1) &
        grepl("not discrepent", comment_SB, ignore.case = TRUE),
      paige_pattern1, research_pattern1
    ),
    research_pattern2_filled = ifelse(
      Dx_Research == "Present" &
        is_empty_pattern(research_pattern2) &
        grepl("not discrepent", comment_SB, ignore.case = TRUE),
      paige_pattern2, research_pattern2
    )
  )

# Reference Grade Group from filled research_pattern1/2
create_grade <- function(p_col, s_col) {
  p_val <- suppressWarnings(as.numeric(p_col))
  s_val <- suppressWarnings(as.numeric(s_col))
  case_when(
    p_val == 3 & s_val == 3 ~ "1",
    p_val == 3 & s_val == 4 ~ "2",
    p_val == 4 & s_val == 3 ~ "3",
    p_val == 4 & s_val == 4 ~ "4",
    p_val == 3 & s_val == 5 ~ "4",
    p_val == 5 & s_val == 3 ~ "4",
    p_val == 4 & s_val == 5 ~ "5",
    p_val == 5 & s_val == 4 ~ "5",
    p_val == 5 & s_val == 5 ~ "5",
    TRUE                    ~ NA_character_
  )
}
ra_join$reference_grade_group <- create_grade(
  ra_join$research_pattern1_filled,
  ra_join$research_pattern2_filled
)

# AI grade group
ra_join$ai_grade_group  <- create_grade(ra_join$paige_pattern1,  ra_join$paige_pattern2)
ra_join$rep_grade_group <- create_grade(ra_join$report_pattern1, ra_join$report_pattern2)

# ---------------------------------------------------------------------------
# 6. Per-pathologist Grade Group from _all_data
# ---------------------------------------------------------------------------
for (rd in c("P1_noAI", "P2_noAI", "P3_noAI", "P4_noAI",
             "P1_withAI", "P2_withAI", "P3_withAI", "P4_withAI")) {
  p <- ad[[paste0(rd, "_Primary")]]
  s <- ad[[paste0(rd, "_Secondary")]]
  ad[[paste0(rd, "_GG")]] <- create_grade(p, s)
}

reader_keys <- c("P1_noAI", "P2_noAI", "P3_noAI", "P4_noAI",
                 "P1_withAI", "P2_withAI", "P3_withAI", "P4_withAI")
reader_gg_cols <- paste0(reader_keys, "_GG")
ad$n_reader_gg_parseable <- rowSums(!is.na(ad[, reader_gg_cols]))

# For every core, summarise WHICH interpreters lack a parseable Gleason and
# what they diagnosed the core as. Helps explain that "missing Gleason" is
# almost always a diagnostic discordance (interpreter called it benign / IHC
# / consult), not a data-entry omission. We surface the same information for
# the original report and the AI here too — those columns also lack Gleason
# whenever those interpreters did not classify the core as malignant.
no_gg_dx_summary <- vapply(seq_len(nrow(ad)), function(i) {
  pieces <- vapply(reader_keys, function(rd) {
    gg <- ad[[paste0(rd, "_GG")]][i]
    if (!is.na(gg)) return(NA_character_)
    dx <- ad[[paste0(rd, "_Diagnosis")]][i]
    if (is.na(dx)) return(paste0(rd, "=missing"))
    paste0(rd, "=", dx)
  }, character(1))
  pieces <- pieces[!is.na(pieces)]
  if (!length(pieces)) "" else paste(pieces, collapse = "; ")
}, character(1))
ad$readers_without_gleason <- no_gg_dx_summary

# ---------------------------------------------------------------------------
# 7. Build the authoritative inclusion_status column
# ---------------------------------------------------------------------------
# Lookup table from the canonical exclude-list, keyed by slide_name
ex_lookup <- ex_list |>
  transmute(Slide_Label = slide_name,
            exclude_list_status = include)   # values: include / excludeIHC / excludeDuplicate / excludeWebNotWorking / exclude

combined <- ad |>
  left_join(ra_join, by = "Slide_Label") |>
  left_join(ex_lookup, by = "Slide_Label") |>
  mutate(
    # Flags showing whether the AI or the original Report lacks a parseable
    # Gleason — these are "missing" only because that interpreter classified
    # the core as non-malignant.
    ai_lacks_gleason     = is.na(ai_grade_group),
    report_lacks_gleason = is.na(rep_grade_group),
    # All-data definition: every interpreter has a parseable GG. Does NOT
    # require the core to be in the Phase II RDS, so this count can be
    # >138 (the kappa-subset count from revision1.qmd / grade_group_stats.json
    # which IS restricted to RDS rows).
    inter_rater_complete_alldata =
      n_reader_gg_parseable == 8L &
      !is.na(reference_grade_group) &
      !is.na(rep_grade_group) &
      !is.na(ai_grade_group),

    # Canonical kappa subset: same as `gg_combined$inter_rater_complete` in
    # revision1.qmd — restricted to the Phase II RDS so a paired AI-vs-noAI
    # comparison is unbiased. THIS is the n=138 number cited in the paper.
    inter_rater_complete =
      inter_rater_complete_alldata & in_phase2_rds,

    # Documented exclusion reasons (from first_phase_results.qmd and the
    # canonical exclude-list maintained by the senior pathologist at
    # `_archive/paige_results/paige-prostate-exclude-list.xlsx`):
    #  1. excludeDuplicate    — duplicate rescans (mostly c17; also c45, c48, c5, c56, c59, c60)
    #  2. excludeIHC          — accidentally uploaded IHC stain images
    #  3. excludeWebNotWorking — Paige website did not run on the slide
    #  4. exclude             — non-prostate tissue (already filtered upstream)
    case_no_for_label = sub("_.*", "", Slide_Label),

    # Canonical Phase I analytical cohort flag, driven by the
    # exclude-list above. Slides not present in the exclude-list default to
    # "include" (the list only enumerates problem cases).
    phase1_analytical_cohort = case_when(
      is.na(exclude_list_status)                  ~ "include",
      exclude_list_status == "include"            ~ "include",
      exclude_list_status == "excludeDuplicate"   ~ "exclude — duplicate rescan",
      exclude_list_status == "excludeIHC"         ~ "exclude — accidentally uploaded IHC stain",
      exclude_list_status == "excludeWebNotWorking" ~ "exclude — Paige website did not run on this slide",
      exclude_list_status == "exclude"            ~ "exclude — non-prostate tissue / other",
      TRUE                                        ~ "exclude — other"
    ),

    inclusion_status = case_when(
      phase1_analytical_cohort != "include"            ~ paste("EXCLUDED (Phase I cohort) —",
                                                                sub("^exclude — ", "", phase1_analytical_cohort)),
      n_reader_cells_filled == 0L                      ~ "Phase I include — no Phase II read recorded (slide eligible but never read)",
      n_reader_cells_filled %in% 1:6                   ~ "Phase II — partial reads only",
      n_reader_cells_filled == 7L                      ~ "Phase II — incomplete (7-of-8 reader cells)",
      n_reader_cells_filled == 8L & !in_phase2_rds     ~ "Phase I include — fully read but not in Phase II RDS (data-cleaning)",
      n_reader_cells_filled == 8L & in_phase2_rds &
        Dx_Research == "Absent"                        ~ "Phase II — benign core, fully read by all 4 pathologists (Gleason not applicable)",
      n_reader_cells_filled == 8L & in_phase2_rds &
        Dx_Research == "ASAP"                          ~ "Phase II — ASAP core, fully read by all 4 pathologists (Gleason not applicable)",
      n_reader_cells_filled == 8L & in_phase2_rds &
        Dx_Research == "Present" & !inter_rater_complete ~ "Phase II — adenocarcinoma core, fully read; ≥1 interpreter classified it as benign/IHC/consult so did not enter a Gleason (diagnostic discordance, not missing data)",
      n_reader_cells_filled == 8L & in_phase2_rds &
        inter_rater_complete                           ~ "Phase II — inter-rater complete-cases subset (used for Fleiss kappa)",
      TRUE                                             ~ "?"
    )
  )

# ---------------------------------------------------------------------------
# 8. Lay out the columns nicely for the audit Excel
# ---------------------------------------------------------------------------
audit <- combined |>
  transmute(
    Slide_Label,
    case_no,
    tissue_control,
    Dx_Paige,
    Dx_Report,
    Dx_Research,
    comment_SB,

    # Reference / AI / Report grade groups (Phase I)
    reference_grade_group,
    ai_grade_group,
    rep_grade_group,
    research_pattern1_filled,
    research_pattern2_filled,
    paige_pattern1, paige_pattern2,
    report_pattern1, report_pattern2,

    # Per-pathologist diagnosis cells
    P1_noAI_Diagnosis,   P1_withAI_Diagnosis,
    P2_noAI_Diagnosis,   P2_withAI_Diagnosis,
    P3_noAI_Diagnosis,   P3_withAI_Diagnosis,
    P4_noAI_Diagnosis,   P4_withAI_Diagnosis,

    # Per-pathologist grade groups
    P1_noAI_GG,   P1_withAI_GG,
    P2_noAI_GG,   P2_withAI_GG,
    P3_noAI_GG,   P3_withAI_GG,
    P4_noAI_GG,   P4_withAI_GG,

    # Cohort flags
    n_reader_cells_filled,
    n_reader_gg_parseable,
    read_by_all_4_pathologists,
    gleason_from_every_reader,
    readers_without_gleason,
    ai_lacks_gleason,
    report_lacks_gleason,
    in_phase1_xlsx,
    in_phase2_rds,
    inter_rater_complete_alldata,
    inter_rater_complete,
    exclude_list_status,
    phase1_analytical_cohort,
    inclusion_status
  ) |>
  arrange(case_no, Slide_Label)

# ---------------------------------------------------------------------------
# 9. Build a one-page summary sheet
# ---------------------------------------------------------------------------
summary_tbl <- audit |>
  count(inclusion_status, name = "n_cores") |>
  arrange(desc(n_cores))

cat("\n=== Inclusion status summary ===\n")
print(summary_tbl)

phase1_breakdown <- audit |> count(phase1_analytical_cohort, sort = TRUE)
n_p1_include <- sum(audit$phase1_analytical_cohort == "include")
n_p1_excluded <- nrow(audit) - n_p1_include

cat(sprintf("\nTotal cores: %d (Phase I as uploaded)\n", nrow(audit)))
cat("  Phase I analytical cohort breakdown:\n")
print(phase1_breakdown)
cat(sprintf("\n  Phase I analytical cohort (include)            : %d\n", n_p1_include))
cat(sprintf("  Phase I excluded (per canonical exclude-list)   : %d\n", n_p1_excluded))
cat(sprintf("  Read by all 4 pathologists in both conditions  : %d\n",
            sum(audit$read_by_all_4_pathologists)))
cat(sprintf("  In Phase II RDS                                : %d\n",
            sum(audit$in_phase2_rds)))
cat(sprintf("  Inter-rater complete-cases (138 subset)        : %d\n",
            sum(audit$inter_rater_complete)))

# Cohort lineage with the exclude-list categories
n_excl_dup    <- sum(audit$phase1_analytical_cohort == "exclude — duplicate rescan")
n_excl_ihc    <- sum(audit$phase1_analytical_cohort == "exclude — accidentally uploaded IHC stain")
n_excl_web    <- sum(audit$phase1_analytical_cohort == "exclude — Paige website did not run on this slide")
n_excl_other  <- sum(audit$phase1_analytical_cohort == "exclude — non-prostate tissue / other")

cohort_lineage <- tibble::tribble(
  ~step,                                                                        ~n,
  "Phase I as uploaded (_all_data.xlsx + report_vs_ai.xlsx)",                   nrow(audit),
  "  Excluded: duplicate rescans",                                             -n_excl_dup,
  "  Excluded: accidentally uploaded IHC stain images",                        -n_excl_ihc,
  "  Excluded: Paige website did not run",                                     -n_excl_web,
  "  Excluded: non-prostate tissue / other",                                   -n_excl_other,
  "Phase I analytical cohort (canonical Phase I denominator)",                  n_p1_include,
  "  Excluded between Phase I and Phase II (data-cleaning / partial reads)",   n_p1_include - sum(audit$in_phase2_rds & audit$phase1_analytical_cohort == "include"),
  "Phase II RDS (used for AI-effect analyses)",                                 sum(audit$in_phase2_rds),
  "  Excluded: at least one interpreter without parseable Gleason",             sum(audit$in_phase2_rds) - sum(audit$inter_rater_complete),
  "Inter-rater complete-cases (Fleiss kappa subset)",                           sum(audit$inter_rater_complete)
)

# Why excluded — by case (inclusion_status text)
case_breakdown <- audit |>
  filter(grepl("^EXCLUDED", inclusion_status)) |>
  count(case_no, inclusion_status) |>
  arrange(case_no)

# Phase I analytical cohort exclusions — name every excluded slide
phase1_exclusions <- audit |>
  filter(phase1_analytical_cohort != "include") |>
  select(Slide_Label, case_no, phase1_analytical_cohort,
         Dx_Paige, Dx_Report, Dx_Research) |>
  arrange(phase1_analytical_cohort, case_no, Slide_Label)

# ---------------------------------------------------------------------------
# 10. Save
# ---------------------------------------------------------------------------
out_path <- file.path(OUT_DIR, "all_data_with_flags.xlsx")
writexl::write_xlsx(
  list(
    "Per-core audit (n=851)"            = audit,
    "Inclusion status summary"          = summary_tbl,
    "Cohort lineage 851->832->810->138" = cohort_lineage,
    "Excluded cores by case"            = case_breakdown,
    "Phase I cohort exclusions (n=19)"  = phase1_exclusions
  ),
  path = out_path
)
cat(sprintf("\nWrote: %s\n", out_path))