# ============================================================
# Data Cleaning: Shaded Lights Experiment (3 Lights)
# ============================================================
# Output: long-format dataset, one row per subject × treatment × guess
# ============================================================
#
# ------------------------------------------------------------
# VARIABLE CODEBOOK
# ------------------------------------------------------------
#
# UNIT OF OBSERVATION
#   Each row is one prediction (guess) made by one subject in one treatment.
#   A subject makes 16 guesses per treatment and completes 5 treatments,
#   giving 80 rows per subject (80 subjects × 80 = 6400 rows total).
#
# SUBJECT IDENTIFIERS
#   subject_id          : Anonymous sequential integer ID (1–80), replacing
#                         the Prolific ID. Consistent across all rows of a subject.
#
# TREATMENT
#   treatment           : The rule governing which light configuration causes
#                         sound. Factor with 5 levels. The rule depends on the
#                         RED and BLUE lights only; the GREEN light is irrelevant.
#                           AND     – sound iff red ON and blue ON
#                           OR      – sound iff red ON or blue ON (at least one)
#                           INHIBIT – sound iff red ON and blue OFF
#                           EITHER  – sound iff exactly one of red/blue ON (XOR)
#                           JOINT   – sound iff red and blue same state (XNOR)
#   round_order         : Position in which the subject encountered this
#                         treatment (1 = first, 5 = last). Treatments are
#                         randomized across subjects, so the same treatment
#                         may appear in different positions for different subjects.
#
# TRIAL-LEVEL VARIABLES (vary within subject × treatment)
#   Guess_Number        : Index of the prediction within the treatment (1–16).
#   Light_Config        : Observable light state shown to the subject for this
#                         prediction, formatted as "(red,blue,green)" where
#                         1 = ON and 0 = OFF.
#                         Possible values: (0,0,0), (0,0,1), (0,1,0), (0,1,1),
#                                          (1,0,0), (1,0,1), (1,1,0), (1,1,1).
#   red_light           : State of the red light (1 = ON, 0 = OFF).
#   blue_light          : State of the blue light (1 = ON, 0 = OFF).
#   green_light         : State of the green light (1 = ON, 0 = OFF).
#                         Green light does NOT affect the correct prediction.
#   Guess               : The subject's prediction on this trial (1 = predicts
#                         sound, 0 = predicts no sound).
#   Machine_CorrectP    : The correct prediction a fully-informed subject should
#                         make given the true rule for this treatment and the
#                         current red/blue light configuration (1 = predict sound,
#                         0 = predict no sound). Green light is irrelevant to all
#                         rules, so Machine_CorrectP depends only on red and blue.
#
# TREATMENT-LEVEL VARIABLES (same value for all 16 guesses within a treatment)
#   time_machine        : Time (in seconds) the subject spent on the observation
#                         page for this treatment.
#   Notes               : Free-text notes the subject wrote during this treatment.
#   certainty           : Subject's self-reported confidence in their predictions
#                         for this treatment (numeric scale 0–100).
#   difficulty          : Subject's self-reported difficulty of this treatment
#                         (numeric scale 1–10). Collected only after the final
#                         (5th) treatment and filled to all rounds within each subject.
#   difficulty_certainty: Additional difficulty/certainty rating (0–100).
#                         Same collection timing as difficulty.
#   predicted_correct_self : Subject's prediction of how many of their own
#                         guesses were correct (numeric, out of 16).
#
# SUBJECT-LEVEL VARIABLES (same value for all rows of a subject)
#   STRATEGY            : Free-text strategy description. Collected once at end.
#   COMMENTS            : Free-text final comments. Collected once at end.
#   num_wrong           : Number of errors in the comprehension check.
#   final_payment       : Total payment in GBP (participation fee + bonus).
#   payment_machine     : The treatment (machine) randomly selected for payment.
#   education           : Highest education level completed (Prolific demographics).
#   age                 : Subject's age in years (Prolific demographics).
#   time_total_min      : Total time (minutes) from experiment start to end of
#                         the last Combined_3lights task page.
#
# TECHNICAL
#   original_color      : Permutation index for light display (0–5). In this
#                         dataset all subjects have original_color = 0, which
#                         corresponds to the standard layout:
#                           col0 = Red light, col1 = Blue light, col2 = Green light.
#                         Since RANDOMIZE_COLORS = 0, no column remapping is needed.
# ------------------------------------------------------------

#install.packages("tidyverse")
library(tidyverse)

# ---- File paths -------------------------------------------
# Paths are relative to the location of this script
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)

path_main <- file.path(script_dir, "3lights_simulation.csv")

# ---- 1. Read raw data ------------------------------------
# 3lights_simulation.csv is in super-wide format: one row per participant,
# with all 5 rounds spread across columns (Combined_3lights.N.player.*).
# Payment info is embedded as Pay.1.player.* columns.
raw <- read_csv(path_main, show_col_types = FALSE)

# ---- 2. Drop rows with no Prolific ID or no guesses -------
guess_cols <- grep("^Combined_3lights\\.\\d+\\.player\\.guess\\d+$",
                   names(raw), value = TRUE)
raw <- raw %>%
  filter(
    !is.na(`Combined_3lights.1.player.ID_subject`),
    `Combined_3lights.1.player.ID_subject` != "",
    str_detect(`Combined_3lights.1.player.ID_subject`, "^[0-9a-f]{24}$"),
    if_any(all_of(guess_cols), ~ !is.na(.x) & .x != "")
  )

# ---- 3. Create subject ID mapping -------------------------
# Prolific ID is only in Combined_3lights.1 (round 1 column).
id_map <- raw %>%
  distinct(participant.code, `Combined_3lights.1.player.ID_subject`) %>%
  arrange(`Combined_3lights.1.player.ID_subject`) %>%
  mutate(subject_id = row_number()) %>%
  rename(prolific_id = `Combined_3lights.1.player.ID_subject`) %>%
  select(participant.code, subject_id, prolific_id)

raw <- left_join(raw, id_map, by = "participant.code")

# ---- 4. Correct-prediction lookup table -------------------
# For each treatment × observable light configuration (red, blue, green),
# define Machine_CorrectP — the correct prediction if the subject knew the rule.
#
# IMPORTANT: All five rules depend only on RED and BLUE lights.
#            The GREEN light is irrelevant to every rule.
#            Machine_CorrectP is therefore identical for the two rows that
#            share the same (red, blue) state regardless of green.
#
# Summary of correct predictions per task:
#   AND:     sound iff red=1 AND blue=1
#   OR:      sound iff red=1 OR blue=1 (at least one)
#   INHIBIT: sound iff red=1 AND blue=0
#   EITHER:  sound iff red ≠ blue (XOR)
#   JOINT:   sound iff red = blue (XNOR)

correct_rules <- tribble(
  ~treatment,  ~red, ~blue, ~green, ~Machine_CorrectP,

  # AND: sound iff BOTH red ON and blue ON (green irrelevant)
  "AND",  0, 0, 0,  0,
  "AND",  0, 0, 1,  0,
  "AND",  0, 1, 0,  0,
  "AND",  0, 1, 1,  0,
  "AND",  1, 0, 0,  0,
  "AND",  1, 0, 1,  0,
  "AND",  1, 1, 0,  1,
  "AND",  1, 1, 1,  1,

  # OR: sound iff AT LEAST ONE of red, blue ON (green irrelevant)
  "OR",   0, 0, 0,  0,
  "OR",   0, 0, 1,  0,
  "OR",   0, 1, 0,  1,
  "OR",   0, 1, 1,  1,
  "OR",   1, 0, 0,  1,
  "OR",   1, 0, 1,  1,
  "OR",   1, 1, 0,  1,
  "OR",   1, 1, 1,  1,

  # INHIBIT: sound iff RED ON and BLUE OFF (green irrelevant)
  "INHIBIT",  0, 0, 0,  0,
  "INHIBIT",  0, 0, 1,  0,
  "INHIBIT",  0, 1, 0,  0,
  "INHIBIT",  0, 1, 1,  0,
  "INHIBIT",  1, 0, 0,  1,
  "INHIBIT",  1, 0, 1,  1,
  "INHIBIT",  1, 1, 0,  0,
  "INHIBIT",  1, 1, 1,  0,

  # EITHER: sound iff EXACTLY ONE of red, blue ON — XOR (green irrelevant)
  "EITHER",   0, 0, 0,  0,
  "EITHER",   0, 0, 1,  0,
  "EITHER",   0, 1, 0,  1,
  "EITHER",   0, 1, 1,  1,
  "EITHER",   1, 0, 0,  1,
  "EITHER",   1, 0, 1,  1,
  "EITHER",   1, 1, 0,  0,
  "EITHER",   1, 1, 1,  0,

  # JOINT: sound iff red = blue (both OFF or both ON — XNOR) (green irrelevant)
  "JOINT",    0, 0, 0,  1,
  "JOINT",    0, 0, 1,  1,
  "JOINT",    0, 1, 0,  0,
  "JOINT",    0, 1, 1,  0,
  "JOINT",    1, 0, 0,  0,
  "JOINT",    1, 0, 1,  0,
  "JOINT",    1, 1, 0,  1,
  "JOINT",    1, 1, 1,  1
)

# ---- 5. Subject-level text variables ----------------------
# Strategy and final comments may be filled in any round; take the first
# non-empty value across all 5 rounds.
subject_text <- raw %>%
  mutate(
    STRATEGY = coalesce(
      na_if(as.character(`Combined_3lights.1.player.prediction_strategy`), ""),
      na_if(as.character(`Combined_3lights.2.player.prediction_strategy`), ""),
      na_if(as.character(`Combined_3lights.3.player.prediction_strategy`), ""),
      na_if(as.character(`Combined_3lights.4.player.prediction_strategy`), ""),
      na_if(as.character(`Combined_3lights.5.player.prediction_strategy`), "")
    ),
    COMMENTS = coalesce(
      na_if(as.character(`Combined_3lights.1.player.final_comments`), ""),
      na_if(as.character(`Combined_3lights.2.player.final_comments`), ""),
      na_if(as.character(`Combined_3lights.3.player.final_comments`), ""),
      na_if(as.character(`Combined_3lights.4.player.final_comments`), ""),
      na_if(as.character(`Combined_3lights.5.player.final_comments`), "")
    )
  ) %>%
  select(participant.code, STRATEGY, COMMENTS)

# ---- 6. Comprehension check errors ------------------------
# num_wrong is only populated in round 1
cq_errors <- raw %>%
  select(participant.code,
         num_wrong = `Combined_3lights.1.player.num_wrong`) %>%
  mutate(num_wrong = as.integer(num_wrong))

# ---- 7. Payment info from embedded Pay columns ------------
pay_clean <- raw %>%
  mutate(
    final_payment   = as.numeric(session.config.participation_fee) +
                      as.numeric(`Pay.1.player.bonus_total`),
    payment_machine = str_match(`Pay.1.player.payment_details`,
                                '"machine"\\s*:\\s*"([^"]+)"')[, 2]
  ) %>%
  select(participant.code, final_payment, payment_machine)

# ---- 7b. Demographics from Prolific export ----------------
# NOTE: For simulated data this file will not exist;
#       demographics will be NA for all subjects.
path_demo <- file.path(script_dir, "Demografic_3lights.csv")
if (file.exists(path_demo)) {
  demographics <- read_csv(path_demo, show_col_types = FALSE) %>%
    select(
      prolific_id = `Participant id`,
      education   = `Highest education level completed`,
      age         = Age
    ) %>%
    distinct(prolific_id, .keep_all = TRUE)
} else {
  demographics <- tibble(prolific_id = character(),
                         education   = character(),
                         age         = integer())
}

# ---- 7c. Total time in experiment -------------------------
# End time = page_load_ts of round 5 + time_on_page of round 5.
time_total <- raw %>%
  mutate(
    time_total_sec = (as.numeric(as.POSIXct(`Combined_3lights.5.player.page_load_ts`,
                                             origin = "1970-01-01")) +
                      as.numeric(`Combined_3lights.5.player.time_on_page`)) -
                     as.numeric(as.POSIXct(participant.time_started_utc,
                                           origin = "1970-01-01")),
    time_total_min = time_total_sec / 60
  ) %>%
  select(participant.code, time_total_min)

# ---- 8. Reshape wide → long (one row per participant × round) ----
# Each round n has columns Combined_3lights.n.player.*.
# Extract and stack into a uniform long format.
select_round <- function(df, n) {
  prefix <- paste0("Combined_3lights.", n, ".")

  rename_map <- c(
    "player.machine_name"           = paste0(prefix, "player.machine_name"),
    "subsession.round_number"       = paste0(prefix, "subsession.round_number"),
    "player.original_color"         = paste0(prefix, "player.original_color"),
    "player.time_on_page"           = paste0(prefix, "player.time_on_page"),
    "player.notes"                  = paste0(prefix, "player.notes"),
    "player.certainty"              = paste0(prefix, "player.certainty"),
    "player.difficulty"             = paste0(prefix, "player.difficulty"),
    "player.difficulty_certainty"   = paste0(prefix, "player.difficulty_certainty"),
    "player.predicted_correct_self" = paste0(prefix, "player.predicted_correct_self"),
    setNames(paste0(prefix, "player.guess", 1:16), paste0("player.guess", 1:16)),
    setNames(paste0(prefix, "player.row",   1:16), paste0("player.row",   1:16))
  )

  df %>%
    select(participant.code, subject_id, prolific_id, !!!rename_map) %>%
    filter(!is.na(player.machine_name), player.machine_name != "")
}

raw_long <- bind_rows(lapply(1:5, select_round, df = raw))

# ---- 9. Pivot to long format ------------------------------
# Pivot guess1–16 and row1–16 to one row per (subject, treatment, guess number).
df_long <- raw_long %>%
  select(
    participant.code,
    subject_id,
    prolific_id,
    treatment            = player.machine_name,
    round_order          = subsession.round_number,
    original_color       = player.original_color,
    time_machine         = player.time_on_page,
    Notes                = player.notes,
    certainty            = player.certainty,
    difficulty           = player.difficulty,
    difficulty_certainty = player.difficulty_certainty,
    predicted_correct_self = player.predicted_correct_self,
    matches("^player\\.(guess|row)\\d+$")
  ) %>%
  pivot_longer(
    cols          = matches("^player\\.(guess|row)\\d+$"),
    names_to      = c(".value", "Guess_Number"),
    names_pattern = "player\\.(guess|row)(\\d+)"
  ) %>%
  rename(Guess = guess, config_raw = row) %>%
  filter(!is.na(Guess), Guess != "") %>%
  mutate(
    Guess_Number = as.integer(Guess_Number),
    Guess        = as.integer(Guess)
  )

# ---- 10. Parse light configuration -------------------------
# config_raw is stored as a numpy-style array string: "[r b g 0 s]"
# Layout of the 5 values:
#   [0] red_light   – Red light   (1 = ON, 0 = OFF)
#   [1] blue_light  – Blue light  (1 = ON, 0 = OFF)
#   [2] green_light – Green light (1 = ON, 0 = OFF)
#   [3] (always 0)  – unused Others slot
#   [4] sound outcome (not observed by subject; drawn from case data for payment)
#
# Light_Config summarises the three observable lights shown to the subject.
df_long <- df_long %>%
  mutate(
    config_nums  = str_extract_all(config_raw, "\\d+"),
    red_light    = map_int(config_nums, ~ as.integer(.x[1])),
    blue_light   = map_int(config_nums, ~ as.integer(.x[2])),
    green_light  = map_int(config_nums, ~ as.integer(.x[3])),
    Light_Config = paste0("(", red_light, ",", blue_light, ",", green_light, ")")
  ) %>%
  select(-config_nums, -config_raw)

# ---- 11. Fill difficulty and difficulty_certainty across rounds ----
# Collected in the final (5th) round only; propagate to all rounds.
df_long <- df_long %>%
  group_by(participant.code) %>%
  fill(difficulty, difficulty_certainty, .direction = "downup") %>%
  ungroup()

# ---- 12. Account for original_color (light display permutation) ------
# For 3-light experiments, original_color encodes a permutation index (0–5)
# that controls which data column maps to which displayed colour:
#   Perm 0: col0 = Red,   col1 = Blue,  col2 = Green  (standard)
#   Perm 1: col0 = Blue,  col1 = Red,   col2 = Green  (Red ↔ Blue)
#   Perm 2: col0 = Green, col1 = Blue,  col2 = Red    (Red ↔ Green)
#   Perm 3: col0 = Red,   col1 = Green, col2 = Blue   (Blue ↔ Green)
#   Perm 4: col0 = Blue,  col1 = Green, col2 = Red    (rotate)
#   Perm 5: col0 = Green, col1 = Red,   col2 = Blue   (rotate)
#
# RANDOMIZE_COLORS = 0 in Combined_3lights, so all subjects have
# original_color = 0 (standard layout; no remapping needed).
# red_light, blue_light, green_light, Light_Config already reflect the
# correct labelling for all rows in this dataset.
df_long <- df_long %>%
  mutate(original_color = as.integer(original_color))
# All rows: original_color == 0 → standard; red/blue/green already correct.

# ---- 13. Join correct predictions -------------------------
# Join on (treatment, red_light, blue_light, green_light).
# Machine_CorrectP is the same for both green=0 and green=1 rows of a given
# (treatment, red, blue) combination, because green is irrelevant to all rules.
df_long <- df_long %>%
  left_join(
    correct_rules %>% rename(red_light = red, blue_light = blue, green_light = green),
    by = c("treatment", "red_light", "blue_light", "green_light")
  )

# ---- 14. Merge subject- and session-level variables -------
df_final <- df_long %>%
  left_join(subject_text, by = "participant.code") %>%
  left_join(cq_errors,    by = "participant.code") %>%
  left_join(pay_clean,    by = "participant.code") %>%
  left_join(time_total,   by = "participant.code") %>%
  select(-participant.code) %>%
  left_join(demographics, by = "prolific_id")

# ---- 15. Final type coercion and variable ordering --------
df_final <- df_final %>%
  mutate(
    subject_id              = as.integer(subject_id),
    treatment               = factor(treatment,
                                     levels = c("AND", "OR", "INHIBIT", "EITHER", "JOINT")),
    round_order             = as.integer(round_order),
    Guess_Number            = as.integer(Guess_Number),
    Guess                   = as.integer(Guess),
    Machine_CorrectP        = as.integer(Machine_CorrectP),
    red_light               = as.integer(red_light),
    blue_light              = as.integer(blue_light),
    green_light             = as.integer(green_light),
    original_color          = as.integer(original_color),
    time_machine            = as.numeric(time_machine),
    certainty               = as.numeric(certainty),
    difficulty              = as.numeric(difficulty),
    difficulty_certainty    = as.numeric(difficulty_certainty),
    predicted_correct_self  = as.numeric(predicted_correct_self),
    num_wrong               = as.integer(num_wrong),
    final_payment           = as.numeric(final_payment),
    payment_machine         = factor(payment_machine,
                                     levels = c("AND", "OR", "INHIBIT", "EITHER", "JOINT")),
    education               = as.character(education),
    age                     = as.integer(age),
    time_total_min          = as.numeric(time_total_min)
  ) %>%
  select(
    # Subject identifier
    subject_id,
    prolific_id,
    # Treatment info
    treatment, round_order,
    # Trial-level
    Guess_Number, Light_Config,
    red_light, blue_light, green_light,  # green is observable but irrelevant to rules
    Guess,            # subject's prediction (0/1)
    Machine_CorrectP, # optimal prediction given the rule (based on red + blue only)
    # Treatment-level assessments
    time_machine, Notes,
    certainty, difficulty, difficulty_certainty,
    predicted_correct_self,
    # Subject-level text
    STRATEGY, COMMENTS,
    # Subject-level scalars
    num_wrong,
    final_payment,
    payment_machine,
    education,
    age,
    time_total_min,
    # Keep original_color for reference
    original_color
  ) %>%
  arrange(subject_id, round_order, Guess_Number)

# ---- 16. Inspect result -----------------------------------
glimpse(df_final)
cat("\nRows:     ", nrow(df_final), "\n")
cat("Subjects: ", n_distinct(df_final$subject_id), "\n")
cat("Treatments:", paste(levels(df_final$treatment), collapse = ", "), "\n")
cat("Guesses per subject × treatment:", n_distinct(df_final$Guess_Number), "\n")

# ---- 17. Learning indicator (binomial test per subject × treatment) --------
# For each subject × treatment, count how many of the 16 guesses were correct
# (Guess == Machine_CorrectP). Test whether this count is significantly above
# chance (p = 0.5, n = 16) using a one-sided binomial test.
#
# max_correct    : number of correct guesses out of 16
# trueextracted  : 1 if binom.test p-value <= 0.01 AND max_correct > 8
#                  (strictly above chance), indicating the subject learned
#                  the true rule.

learning <- df_final %>%
  group_by(subject_id, treatment) %>%
  summarise(
    max_correct = sum(Guess == Machine_CorrectP),
    .groups = "drop"
  ) %>%
  rowwise() %>%
  mutate(
    trueextracted = as.integer(
      binom.test(max_correct, 16, 0.5)$p.value <= 0.01 & max_correct > 8
    )
  ) %>%
  ungroup()

# Join back into df_final
df_final <- df_final %>%
  left_join(learning, by = c("subject_id", "treatment"))

cat("\nLearning summary (trueextracted) by treatment:\n")
print(
  df_final %>%
    distinct(subject_id, treatment, trueextracted) %>%
    count(treatment, trueextracted)
)

# ---- 18. Payment table ------------------------------------
payment_table <- df_final %>%
  distinct(subject_id, prolific_id, final_payment) %>%
  arrange(subject_id)

cat("\nPayment table:\n")
print(payment_table, n = Inf)

# ---- 19. Comments table -----------------------------------
comments_table <- df_final %>%
  distinct(subject_id, COMMENTS) %>%
  arrange(subject_id)

cat("\nComments table:\n")
for (i in seq_len(nrow(comments_table))) {
  cat(sprintf("\nSubject %d:\n", comments_table$subject_id[i]))
  text <- ifelse(is.na(comments_table$COMMENTS[i]), "(none)", comments_table$COMMENTS[i])
  cat(strwrap(text, width = 80, indent = 2, exdent = 2), sep = "\n")
}

# ---- 20. Average correct guesses per treatment ------------
avg_correct <- df_final %>%
  distinct(subject_id, treatment, max_correct) %>%
  group_by(treatment) %>%
  summarise(avg_correct = mean(max_correct, na.rm = TRUE), .groups = "drop") %>%
  arrange(treatment)

cat("\nAverage correct guesses per treatment (out of 16):\n")
print(avg_correct)

# ---- 21. Per-subject × treatment summary ------------------
subject_treatment_summary <- df_final %>%
  distinct(subject_id, treatment, round_order, time_machine, max_correct) %>%
  arrange(subject_id, round_order)

cat("\nSubject × treatment summary (machine, time, correct guesses):\n")
for (sid in unique(subject_treatment_summary$subject_id)) {
  cat(sprintf("\nSubject %d:\n", sid))
  sub <- subject_treatment_summary %>% filter(subject_id == sid)
  for (j in seq_len(nrow(sub))) {
    cat(sprintf("  Round %d | %-7s | Time: %5.0f s | Correct guesses: %d/16\n",
                sub$round_order[j],
                as.character(sub$treatment[j]),
                sub$time_machine[j],
                sub$max_correct[j]))
  }
}