suppressPackageStartupMessages({
  library(data.table)
  library(ggplot2)
  library(DBI)
  library(duckdb)
  library(ggrepel)
})

.linkedin_dir <- tryCatch(
  dirname(normalizePath(sub("--file=", "", grep("--file=", commandArgs(FALSE), value = TRUE)[1]))),
  error = function(e) getwd()
)

db_path <- file.path(path.expand(Sys.getenv("LAHMANS_DBDIR", "~/Documents/Data/baseball")),
                     "baseball.duckdb")
drv <- duckdb::duckdb()
con <- dbConnect(drv, db_path, read_only = TRUE)
on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)

sql <- "WITH tenures AS (
    SELECT playerID, teamID,
           MIN(yearID) AS tenure_start,
           MAX(yearID) AS tenure_end,
           COUNT(*) AS tenure_years,
           SUM(salary) AS total_salary
    FROM (
        SELECT s.*,
               s.yearID - ROW_NUMBER()
                   OVER (PARTITION BY s.playerID, s.teamID ORDER BY s.yearID) AS grp
        FROM SalariesAll s
        WHERE s.is_actual = TRUE AND s.salary > 500000
    ) g
    GROUP BY playerID, teamID, grp
    HAVING COUNT(*) >= 2
),
debut AS (
    SELECT playerID, MIN(yearID) AS debut_year
    FROM (
        SELECT playerID, yearID FROM Batting WHERE G > 0
        UNION ALL
        SELECT playerID, yearID FROM Pitching WHERE G > 0
    )
    GROUP BY playerID
),
pt_raw AS (
    SELECT playerID, yearID,
           MAX(G) AS g_bat, MAX(AB) AS ab, 0 AS gs, 0 AS ip_outs
    FROM Batting GROUP BY playerID, yearID
    UNION ALL
    SELECT playerID, yearID, 0, 0, MAX(GS), MAX(IPouts)
    FROM Pitching WHERE GS >= 1 GROUP BY playerID, yearID
),
pt_agg AS (
    SELECT playerID, yearID,
           MAX(g_bat) AS g_bat, MAX(ab) AS ab,
           MAX(gs)    AS gs,    MAX(ip_outs) AS ip_outs
    FROM pt_raw GROUP BY playerID, yearID
),
pt_factor AS (
    SELECT playerID, yearID,
           CASE
               WHEN gs  >= 10 THEN LEAST(1.0, gs / 30.0)
               WHEN ab  >= 50 THEN LEAST(1.0, g_bat / 150.0)
               ELSE NULL
           END AS pt_factor
    FROM pt_agg
    WHERE g_bat >= 20 OR gs >= 5
),
war_adj AS (
    SELECT pw.playerID, pw.yearID, pw.total_war,
           pf.pt_factor,
           CASE WHEN pf.pt_factor > 0.15
                THEN pw.total_war / pf.pt_factor
                ELSE NULL END AS war_if_healthy
    FROM PlayerWAR pw
    JOIN pt_factor pf ON pw.playerID = pf.playerID AND pw.yearID = pf.yearID
),
pre_war AS (
    SELECT t.playerID, t.teamID, t.tenure_start,
           AVG(w.total_war) AS pre_war_avg,
           COUNT(w.yearID)  AS pre_war_n
    FROM tenures t
    JOIN PlayerWAR w ON t.playerID = w.playerID
        AND w.yearID BETWEEN t.tenure_start - 3 AND t.tenure_start - 1
    GROUP BY t.playerID, t.teamID, t.tenure_start
    HAVING COUNT(*) >= 1
),
contract_war AS (
    SELECT t.playerID, t.teamID, t.tenure_start,
           SUM(wa.total_war)                                   AS delivered_war,
           SUM(wa.war_if_healthy)                              AS healthy_war,
           AVG(wa.pt_factor)                                   AS avg_availability,
           SUM(CASE WHEN wa.pt_factor < 0.60 THEN 1 ELSE 0 END) AS injury_seasons
    FROM tenures t
    JOIN war_adj wa ON t.playerID = wa.playerID
        AND wa.yearID BETWEEN t.tenure_start AND t.tenure_end
    GROUP BY t.playerID, t.teamID, t.tenure_start
),
p AS (
    SELECT nameFirst || ' ' || nameLast AS player_name, playerID FROM People
)
SELECT
    t.playerID,
    p.player_name,
    t.teamID,
    t.tenure_start,
    t.tenure_years,
    t.total_salary,
    pw.pre_war_avg,
    cw.delivered_war,
    cw.healthy_war,
    cw.avg_availability,
    cw.injury_seasons,
    pw.pre_war_avg * t.tenure_years                AS implied_war,
    cw.delivered_war  - pw.pre_war_avg * t.tenure_years AS war_delta,
    cw.healthy_war    - pw.pre_war_avg * t.tenure_years AS war_delta_healthy,
    CASE
        WHEN t.tenure_start <= 2002                THEN 'Pre-Moneyball'
        WHEN t.tenure_start BETWEEN 2003 AND 2011  THEN 'Moneyball'
        ELSE 'Big Data'
    END AS era
FROM tenures t
JOIN debut  d  ON t.playerID = d.playerID
JOIN pre_war pw ON t.playerID = pw.playerID AND t.teamID = pw.teamID
                AND t.tenure_start = pw.tenure_start
JOIN contract_war cw ON t.playerID = cw.playerID AND t.teamID = cw.teamID
                     AND t.tenure_start = cw.tenure_start
JOIN p ON t.playerID = p.playerID
WHERE t.tenure_start - d.debut_year >= 6
  AND t.total_salary > 2000000
  AND t.tenure_years >= 2
  AND t.tenure_start BETWEEN 1990 AND 2022
  AND t.tenure_start != 2020"

contracts <- as.data.table(dbGetQuery(con, sql))
cat("Contracts:", nrow(contracts), "\n")


## ---- era league medians (aging-curve baseline) ------------------------------
## Compute separate medians for raw and injury-adjusted war delta
era_medians <- contracts[, .(
    era_median         = median(war_delta,         na.rm = TRUE),
    era_median_healthy = median(war_delta_healthy, na.rm = TRUE)
), by = era]
contracts <- era_medians[contracts, on = "era"]
contracts[, rel_war_delta         := war_delta         - era_median]
contracts[, rel_war_delta_healthy := war_delta_healthy - era_median_healthy]

## ---- team-era aggregates ----------------------------------------------------
## Exclude small-sample historic franchise codes (< 10 contracts across all eras)
team_counts <- contracts[, .N, by = teamID]
active <- team_counts[N >= 10, teamID]

team_era <- contracts[teamID %in% active,
    .(n                   = .N,
      median_rel          = median(rel_war_delta,         na.rm = TRUE),
      median_rel_healthy  = median(rel_war_delta_healthy, na.rm = TRUE),
      avg_rel             = mean(rel_war_delta,           na.rm = TRUE),
      pct_beat            = mean(war_delta > 0,           na.rm = TRUE) * 100,
      avg_availability    = mean(avg_availability,        na.rm = TRUE),
      avg_injury_seas     = mean(injury_seasons,          na.rm = TRUE),
      avg_dpw             = mean(total_salary[delivered_war > 0.5] /
                                 delivered_war[delivered_war > 0.5], na.rm = TRUE) / 1e6,
      total_M             = sum(total_salary,             na.rm = TRUE) / 1e6),
    keyby = .(teamID, era)]

## require >= 3 contracts per era cell to show a dot
team_era <- team_era[n >= 3]

## era factor for ordering
era_levels <- c("Pre-Moneyball", "Moneyball", "Big Data")
team_era[, era := factor(era, levels = era_levels)]

## ---- teams present in all 3 eras (for slope/bump lines) --------------------
era_counts <- team_era[, .N, by = teamID]
all3 <- era_counts[N == 3, teamID]
team_era3 <- team_era[teamID %in% all3]

## rank within each era (higher median_rel = better)
team_era3[, rank := frank(-median_rel, ties.method = "average"), by = era]

## ---- label candidates -------------------------------------------------------
## label top 4 + bottom 4 in the Big Data era, plus a few notable franchises
notable <- c("OAK", "TBA", "HOU", "NYA", "LAA", "BOS", "CHN", "SLN", "TEX", "SDN")
bd_ranks <- team_era3[era == "Big Data"][order(rank)]
label_teams <- union(
  c(head(bd_ranks$teamID, 4), tail(bd_ranks$teamID, 4)),
  notable
)
team_era3[, label := ifelse(teamID %in% label_teams, teamID, NA_character_)]

## Big Data era rank ordering for y-axis
bd_order <- team_era3[era == "Big Data"][order(rank), teamID]
team_era3[, teamID_f := factor(teamID, levels = rev(bd_order))]

## ---- colour: payroll tier (based on avg total_M across eras) ----------------
payroll_tier <- contracts[teamID %in% all3,
    .(avg_spend = mean(total_salary, na.rm = TRUE) / 1e6),
    by = teamID]
payroll_tier[, tier := cut(avg_spend,
    breaks = quantile(avg_spend, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
    labels = c("Small market", "Mid market", "Large market"),
    include.lowest = TRUE)]
team_era3 <- payroll_tier[, .(teamID, tier)][team_era3, on = "teamID"]

## ---- success thresholds by market tier and era ------------------------------
## Small market: made playoffs at least once
## Mid market:   reached World Series at least once
## Large market: won World Series at least once
sql_success <- "
SELECT teamID,
       era_col AS era,
       MAX(CASE WHEN DivWin = 'Y' OR WCWin = 'Y' THEN 1 ELSE 0 END) AS made_playoffs,
       MAX(CASE WHEN LgWin  = 'Y'                 THEN 1 ELSE 0 END) AS made_ws,
       MAX(CASE WHEN WSWin  = 'Y'                 THEN 1 ELSE 0 END) AS won_ws,
       SUM(CASE WHEN DivWin = 'Y' OR WCWin = 'Y' THEN 1 ELSE 0 END) AS playoff_apps,
       SUM(CASE WHEN LgWin  = 'Y'                 THEN 1 ELSE 0 END) AS ws_apps,
       SUM(CASE WHEN WSWin  = 'Y'                 THEN 1 ELSE 0 END) AS ws_wins
FROM (
    SELECT *,
           CASE
               WHEN yearID <= 2002 THEN 'Pre-Moneyball'
               WHEN yearID BETWEEN 2003 AND 2011 THEN 'Moneyball'
               ELSE 'Big Data'
           END AS era_col
    FROM Teams
    WHERE yearID BETWEEN 1990 AND 2024
      AND yearID != 2020
) t
GROUP BY teamID, era_col
"
success_raw <- as.data.table(dbGetQuery(con, sql_success))

team_era3 <- success_raw[team_era3, on = c("teamID", "era")]

## Apply tier-specific success bar
team_era3[, met_success := fcase(
    tier == "Small market" & made_playoffs == 1L, TRUE,
    tier == "Mid market"   & made_ws       == 1L, TRUE,
    tier == "Large market" & won_ws        == 1L, TRUE,
    default = FALSE
)]
team_era3[, success_label := fifelse(
    met_success,
    paste0(tier, "\n(met bar)"),
    paste0(tier, "\n(missed bar)")
)]

## ---- MVA (Management Value Added) scoring ----------------------------------
## success_score: frequency-weighted, tier-adjusted
##   Small market:  playoff_apps*3 + ws_apps*4 + ws_wins*5
##   Mid market:    playoff_apps*2 + ws_apps*3 + ws_wins*4
##   Large market:  playoff_apps*1 + ws_apps*2 + ws_wins*3
## Then z-score each component within era so no tier dominates.
tier_weights <- list(
    "Small market" = c(playoff = 3L, ws = 4L, win = 5L),
    "Mid market"   = c(playoff = 2L, ws = 3L, win = 4L),
    "Large market" = c(playoff = 1L, ws = 2L, win = 3L)
)
w_playoff <- vapply(as.character(team_era3$tier),
                    function(t) tier_weights[[t]]["playoff"], numeric(1L))
w_ws      <- vapply(as.character(team_era3$tier),
                    function(t) tier_weights[[t]]["ws"],      numeric(1L))
w_win     <- vapply(as.character(team_era3$tier),
                    function(t) tier_weights[[t]]["win"],     numeric(1L))
team_era3[, success_raw_score :=
    as.numeric(playoff_apps) * w_playoff +
    as.numeric(ws_apps)      * w_ws      +
    as.numeric(ws_wins)      * w_win]

zscore <- function(x) {
    s <- sd(x, na.rm = TRUE)
    if (is.na(s) || s == 0) return(rep(0, length(x)))
    (x - mean(x, na.rm = TRUE)) / s
}
team_era3[, success_z  := zscore(success_raw_score), by = era]
team_era3[, war_z      := zscore(median_rel_healthy), by = era]
team_era3[, injury_adj := median_rel_healthy - median_rel]
team_era3[, injury_z   := zscore(injury_adj), by = era]
team_era3[, mva_score  := success_z + war_z + injury_z]


## =============================================================================

cat("team_era3:", nrow(team_era3), "rows\n")

## Build teamID -> short name lookup for consistent labels across all charts
short_names_mva <- c(
  NYY = "Yankees",  NYM = "Mets",    LAD = "Dodgers",  BOS = "Red Sox",
  OAK = "Athletics", TBD = "Rays",   HOU = "Astros",   ATL = "Braves",
  ARI = "D-Backs",  CHC = "Cubs",    SFG = "Giants",   STL = "Cardinals",
  PHI = "Phillies", CLE = "Guardians", MIN = "Twins",  DET = "Tigers",
  SEA = "Mariners", TEX = "Rangers", BAL = "Orioles",  TOR = "Blue Jays",
  KCR = "Royals",   MIL = "Brewers", FLA = "Marlins",  CIN = "Reds",
  SDP = "Padres",   COL = "Rockies", ANA = "Angels",   WSN = "Nationals",
  CHW = "White Sox", PIT = "Pirates"
)
fran_map_mva <- as.data.table(dbGetQuery(con, "
  SELECT t.teamID, t.franchID
  FROM (SELECT teamID, franchID,
               ROW_NUMBER() OVER (PARTITION BY teamID ORDER BY yearID DESC) AS rn
        FROM Teams) t WHERE t.rn = 1"))
fran_map_mva[, short := short_names_mva[franchID]]
fran_map_mva[is.na(short), short := franchID]
team_era3 <- merge(team_era3, fran_map_mva[, .(teamID, short)],
                   by = "teamID", all.x = TRUE)
team_era3[is.na(short), short := teamID]

## Source mva_quadrant.R to generate chart
source(file.path(.linkedin_dir, "mva_quadrant.R"))
cat("mva_quadrant.png saved.\n")
