# ============================================================ # Analysis: Shaded Lights Experiment — 3-Light Dataset # ============================================================ # Sources Structural_3lights.R → DOfile_realexp_3lights.R # Structural_3lights (2).R → DOfile_realexp_3lights (2).R # Stacks them vertically with continuous subject numbering. # Output: Data_3L — one row per subject × treatment × guess # ============================================================ # # ------------------------------------------------------------ # DATA_3L — VARIABLE CODEBOOK # ------------------------------------------------------------ # # UNIT OF OBSERVATION # One row per subject × treatment × guess (trial). # Each subject makes 16 guesses per treatment across 5 treatments # → 80 rows per subject. # # ── IDENTIFIERS ───────────────────────────────────────────── # subject_id : Integer. Sequential anonymous ID. # prolific_id : Character. Anonymised Prolific ID (24-char hex). # session : Factor. "3L" for all rows (single session). # # ── TREATMENT ─────────────────────────────────────────────── # treatment : Character. Logical rule governing the machine. # Five levels: AND, OR, INHIBIT, EITHER, JOINT. # The rule depends only on Red and Blue lights; # Green light is irrelevant to all rules. # round_order : Integer (1–5). Position of this treatment in the # subject's randomised sequence. # # ── TRIAL-LEVEL VARIABLES ─────────────────────────────────── # Guess_Number : Integer (1–16). Index within the treatment. # Light_Config : Character. Observable light state formatted as # "(red,blue,green)"; 8 possible values. # red_light : Integer (0/1). State of the red light. # blue_light : Integer (0/1). State of the blue light. # green_light : Integer (0/1). State of the green light. # Green does NOT affect Machine_CorrectP. # Guess : Integer (0/1). Subject's prediction. # Machine_CorrectP: Integer (0/1). Optimal prediction given the rule # (based on red and blue only). # # ── TREATMENT-LEVEL VARIABLES ─────────────────────────────── # time_machine : Numeric. Seconds on the prediction page. # Notes : Character. Free-text notes. # certainty : Numeric. Self-reported confidence (0–100). # difficulty : Numeric. Self-reported difficulty (1–10). # difficulty_certainty : Numeric. Combined rating (0–100). # predicted_correct_self : Numeric. Subject's own accuracy estimate. # max_correct : Integer (0–16). Correct guesses in treatment. # trueextracted : Integer (0/1). 1 if subject demonstrably learned # the true rule: binom.test(max_correct, 16, 0.5) # p-value ≤ 0.01 AND max_correct > 8. # # ── SUBJECT-LEVEL VARIABLES ───────────────────────────────── # STRATEGY, COMMENTS, num_wrong, final_payment, # payment_machine, education, age, time_total_min, # original_color : same definitions as DOfile_realexp_3lights.R. # # ── RULE-VALUE VARIABLES ──────────────────────────────────── # val_ : Numeric. Proportion of trials in this treatment's # dataset that would predict correctly. # Rules: all named single- and two-variable rules # from Structural_3lights.R (38 strategies). # # ── PER-CONFIG ACCURACY OF THE TRUE RULE ──────────────────── # ACC_000 … ACC_111 : Numeric. Accuracy of the true rule on each of # the 8 Light_Config values within this treatment. # # ── LIGHT-CONFIG FREQUENCIES ──────────────────────────────── # Frequency_000 … Frequency_111 : Numeric. Proportion of trials in # the treatment dataset at each Light_Config. # # ── STRUCTURAL (MIXTURE-MODEL) POSTERIORS ─────────────────── # post_ : Numeric. Bayesian posterior probability that this # subject × treatment is generated by . # From Structural_3lights.R mixture model. # Rule_used : Character. Rule with highest posterior. # posterior : Numeric (2 d.p.). Posterior of Rule_used. # # ── ANALYSIS VARIABLES ────────────────────────────────────── # correlationhigh : Integer (0/1). 1 for treatments where the second- # best single-variable rule provides high competition # with the true rule (difficult treatments): # OR (freq3 — DIFFICULT) # EITHER (freq6 — DIFFICULT) # 0 for easier treatments: # AND (freq2), JOINT (freq9), INHIBIT (freq15) # ------------------------------------------------------------ library(tidyverse) # ---- 1. Source structural scripts (runs DOfile then stratEst estimation) ------ # Each script is sourced into its own environment to prevent name collisions. script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path) #env_3L1 <- new.env(parent = globalenv()) #source(file.path(script_dir, "Structural_modelselection.R"), local = env_3L1) env_3L1 <- new.env(parent = globalenv()) source(file.path(script_dir, "Structural_3lights.R"), local = env_3L1) #env_3L2 <- new.env(parent = globalenv()) #source(file.path(script_dir, "Structural_modelselection(2).R"), local = env_3L2) env_3L2 <- new.env(parent = globalenv()) source(file.path(script_dir, "Structural_3lights (2).R"), local = env_3L2) df_3L1 <- env_3L1$df_final_post # session 1 df_3L2 <- env_3L2$df_final_post # session 2 # ---- 2. Add session identifiers df_3L1 <- df_3L1 %>% mutate(session = "3L(1)") df_3L2 <- df_3L2 %>% mutate(session = "3L(2)") # ---- 3. Continue subject numbering across sessions # 3L(1) subjects keep their original IDs (1 … N₁). # 3L(2) subject IDs are offset by the number of subjects in 3L(1) # so that numbering is continuous and never restarts from 1. n_subjects_3L1 <- n_distinct(df_3L1$subject_id) df_3L2 <- df_3L2 %>% mutate(subject_id = subject_id + n_subjects_3L1) # ---- 4. Stack vertically → Data_3L Data_3L <- bind_rows(df_3L1, df_3L2) %>% mutate(session = factor(session, levels = c("3L(1)", "3L(2)"))) %>% arrange(session, subject_id, round_order, Guess_Number) # ---- 5. correlationhigh variable # correlationhigh = 1 for session × treatment combinations labelled DIFFICULT # in __init__.py (single-variable heuristics most competitive with true rule): # 3L(1): OR (freq3), EITHER (freq6) # 3L(2): AND (freq1), INHIBIT (freq12) # correlationhigh = 0 for EASY treatments: # 3L(1): AND (freq2), JOINT (freq9), INHIBIT (freq15) # 3L(2): OR (freq5), EITHER (freq7), JOINT (freq9_A) Data_3L <- Data_3L |> mutate(correlationhigh = as.integer( (session == "3L(1)" & treatment %in% c("AND", "INHIBIT")) | (session == "3L(2)" & treatment %in% c("JOINT", "OR", "EITHER")) )) # ---- 2. Subject_acc: guess-weighted accuracy by light-config frequency ------- # For each guess the contribution is: # ACC_lightconfig if Guess == Machine_CorrectP (subject chose optimally) # 1 - ACC_lightconfig otherwise # Subject_acc is the mean over the 16 guesses within a (subject, treatment). subject_stats <- Data_3L |> mutate( acc_val = case_when( Light_Config == "(0,0,0)" ~ ACC_000, Light_Config == "(0,0,1)" ~ ACC_001, Light_Config == "(0,1,0)" ~ ACC_010, Light_Config == "(0,1,1)" ~ ACC_011, Light_Config == "(1,0,0)" ~ ACC_100, Light_Config == "(1,0,1)" ~ ACC_101, Light_Config == "(1,1,0)" ~ ACC_110, Light_Config == "(1,1,1)" ~ ACC_111 ), guess_acc = if_else(Guess == Machine_CorrectP, acc_val, 1 - acc_val) ) |> group_by(subject_id, treatment, correlationhigh) |> summarise( Subject_acc = mean(guess_acc, na.rm = TRUE), max_correct = first(max_correct), .groups = "drop" ) # ---- 3. CDF plots by statistical properties (correlationhigh): Aggregate ---- # Reset graphics device to clear any invalid state from sourced scripts tryCatch(grDevices::dev.off(), error = function(e) invisible(NULL)) corr_labels <- c("0" = "Low correlation", "1" = "High correlation") # CDF of max_correct p_max <- subject_stats |> mutate(correlationhigh = factor(correlationhigh)) |> ggplot(aes(x = max_correct, colour = correlationhigh)) + stat_ecdf(linewidth = 0.8) + scale_colour_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels) + labs( x = "Number of correct guesses (max_correct)", y = "Cumulative proportion", colour = NULL ) + theme_bw() + theme(legend.position = "bottom") print(p_max) # ---- 6. CDF plots by statistical properties (correlationhigh): By treatment -- p_max_bytreat <- subject_stats |> mutate(correlationhigh = factor(correlationhigh)) |> ggplot(aes(x = max_correct, colour = correlationhigh)) + stat_ecdf(linewidth = 0.8) + scale_colour_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels) + facet_wrap(~ treatment, nrow = 1) + labs( x = "Number of correct guesses (max_correct)", y = "Cumulative proportion", colour = NULL ) + theme_bw() + theme(legend.position = "bottom") print(p_max_bytreat) # ---- 4. Bar plots: fraction of subjects per max_correct bin ----------------- bar_data <- subject_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(correlationhigh, max_correct) |> summarise(n = n(), .groups = "drop") |> group_by(correlationhigh) |> mutate(frac = n / sum(n)) |> ungroup() means_data <- subject_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(correlationhigh) |> summarise(mean_mc = mean(max_correct, na.rm = TRUE), .groups = "drop") p_bar <- ggplot(bar_data, aes(x = max_correct, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", width = 0.7, alpha = 0.85) + geom_vline(data = means_data, aes(xintercept = mean_mc, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels) + scale_colour_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels, guide = "none") + scale_x_continuous(breaks = 0:16) + labs( x = "Number of correct guesses (max_correct)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "bottom") print(p_bar) # 10 panels: one per treatment × correlationhigh bar_data_treat <- subject_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(treatment, correlationhigh, max_correct) |> summarise(n = n(), .groups = "drop") |> group_by(treatment, correlationhigh) |> mutate(frac = n / sum(n)) |> ungroup() means_data_treat <- subject_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(treatment, correlationhigh) |> summarise(mean_mc = mean(max_correct, na.rm = TRUE), .groups = "drop") p_bar_treat <- ggplot(bar_data_treat, aes(x = max_correct, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", width = 0.8, alpha = 0.85) + geom_vline(data = means_data_treat, aes(xintercept = mean_mc, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels) + scale_colour_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels, guide = "none") + scale_x_continuous(breaks = seq(0, 16, 4)) + facet_grid(correlationhigh ~ treatment, labeller = labeller(correlationhigh = corr_labels)) + labs( x = "Number of correct guesses (max_correct)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "none") print(p_bar_treat) # ---- 8. Bar plots: fraction of subjects by trueextracted × correlationhigh -- te_stats <- Data_3L |> distinct(subject_id, treatment, correlationhigh, trueextracted) bar_data_te <- te_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(correlationhigh, trueextracted) |> summarise(n = n(), .groups = "drop") |> group_by(correlationhigh) |> mutate(frac = n / sum(n)) |> ungroup() means_data_te <- te_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(correlationhigh) |> summarise(mean_te = mean(trueextracted, na.rm = TRUE), .groups = "drop") p_bar_te <- ggplot(bar_data_te, aes(x = trueextracted, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", width = 0.5, alpha = 0.85) + geom_vline(data = means_data_te, aes(xintercept = mean_te, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels) + scale_colour_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels, guide = "none") + scale_x_continuous(breaks = 0:1, labels = c("Not extracted", "Extracted")) + labs( x = "True rule extracted (trueextracted)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "bottom") print(p_bar_te) # 10 panels: one per treatment × correlationhigh bar_data_te_treat <- te_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(treatment, correlationhigh, trueextracted) |> summarise(n = n(), .groups = "drop") |> group_by(treatment, correlationhigh) |> mutate(frac = n / sum(n)) |> ungroup() means_data_te_treat <- te_stats |> mutate(correlationhigh = factor(correlationhigh)) |> group_by(treatment, correlationhigh) |> summarise(mean_te = mean(trueextracted, na.rm = TRUE), .groups = "drop") p_bar_te_treat <- ggplot(bar_data_te_treat, aes(x = trueextracted, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", width = 0.5, alpha = 0.85) + geom_vline(data = means_data_te_treat, aes(xintercept = mean_te, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels) + scale_colour_manual(values = c("0" = "#2166ac", "1" = "#d6604d"), labels = corr_labels, guide = "none") + scale_x_continuous(breaks = 0:1, labels = c("Not extracted", "Extracted")) + facet_grid(correlationhigh ~ treatment, labeller = labeller(correlationhigh = corr_labels)) + labs( x = "True rule extracted (trueextracted)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "none") print(p_bar_te_treat) # ---- 5. Mixed-effects regression: trueextracted ~ correlationhigh ----------- library(lme4) lmm_maxcorrect <- lmer( trueextracted ~ correlationhigh + (1 | subject_id), data = te_stats, REML = TRUE ) summary(lmm_maxcorrect) # Variance decomposition vc <- as.data.frame(VarCorr(lmm_maxcorrect)) total_var <- sum(vc$vcov) cat("\n--- Variance decomposition ---\n") cat(sprintf("Subject intercept variance : %.3f (%.1f%%)\n", vc$vcov[vc$grp == "subject_id"], 100 * vc$vcov[vc$grp == "subject_id"] / total_var)) cat(sprintf("Residual variance : %.3f (%.1f%%)\n", vc$vcov[vc$grp == "Residual"], 100 * vc$vcov[vc$grp == "Residual"] / total_var)) cat(sprintf("Fixed effect (correlationhigh): %.3f (SE = %.3f)\n", fixef(lmm_maxcorrect)["correlationhigh"], sqrt(diag(vcov(lmm_maxcorrect)))["correlationhigh"])) # ---- 6. Suboptimal extraction and randomizing ------------------------------ # 10a. All 256 deterministic rules mapped to Light_Config values. # expand.grid with p000 as LSB: row i+1 corresponds to integer i, # where bit k-1 gives the prediction for state k. rule_table_3L <- expand.grid( p000 = 0:1, p001 = 0:1, p010 = 0:1, p011 = 0:1, p100 = 0:1, p101 = 0:1, p110 = 0:1, p111 = 0:1 ) |> as_tibble() |> mutate(rule_id = row_number()) |> pivot_longer(-rule_id, names_to = "config_key", values_to = "rule_pred") |> mutate(Light_Config = case_when( config_key == "p000" ~ "(0,0,0)", config_key == "p001" ~ "(0,0,1)", config_key == "p010" ~ "(0,1,0)", config_key == "p011" ~ "(0,1,1)", config_key == "p100" ~ "(1,0,0)", config_key == "p101" ~ "(1,0,1)", config_key == "p110" ~ "(1,1,0)", config_key == "p111" ~ "(1,1,1)" )) |> select(-config_key) # 10b. Find which rule_id corresponds to the true rule for each treatment true_rule_ids <- Data_3L |> distinct(treatment, Light_Config, Machine_CorrectP) |> left_join(rule_table_3L, by = "Light_Config", relationship = "many-to-many") |> group_by(treatment, rule_id) |> summarise(is_true = all(Machine_CorrectP == rule_pred), .groups = "drop") |> filter(is_true) |> select(treatment, true_rule_id = rule_id) # 10c. Count matches against all 255 non-true rules per subject × treatment. # suboptimalextracted = 1 if any non-true rule passes binom.test # (p <= 0.01, count > 8; 16 guesses, strictly above chance). suboptimalextracted_df <- Data_3L |> select(subject_id, treatment, correlationhigh, Light_Config, Guess) |> left_join(rule_table_3L, by = "Light_Config", relationship = "many-to-many") |> left_join(true_rule_ids, by = "treatment") |> filter(rule_id != true_rule_id) |> group_by(subject_id, treatment, correlationhigh, rule_id) |> summarise(n_match = sum(Guess == rule_pred), .groups = "drop") |> rowwise() |> mutate(passes = binom.test(n_match, 16, 0.5)$p.value <= 0.01 & n_match > 8) |> ungroup() |> group_by(subject_id, treatment, correlationhigh) |> summarise(suboptimalextracted = as.integer(any(passes)), .groups = "drop") # 10d. Combine with trueextracted and define randomizing extraction_stats <- Data_3L |> distinct(subject_id, treatment, correlationhigh, trueextracted) |> left_join(suboptimalextracted_df, by = c("subject_id", "treatment", "correlationhigh")) |> mutate(randomizing = as.integer(trueextracted == 0 & suboptimalextracted == 0)) # 10e. Frequency table by correlationhigh tbl_corr <- extraction_stats |> group_by(correlationhigh) |> summarise( N = n(), true_ext = sum(trueextracted, na.rm = TRUE), subopt = sum(suboptimalextracted, na.rm = TRUE), rand = sum(randomizing, na.rm = TRUE), .groups = "drop" ) |> mutate( true_ext_pct = round(true_ext / N * 100, 1), subopt_pct = round(subopt / N * 100, 1), rand_pct = round(rand / N * 100, 1) ) print(tbl_corr) # 10f. Frequency table by treatment tbl_treat <- extraction_stats |> group_by(treatment) |> summarise( N = n(), true_ext = sum(trueextracted, na.rm = TRUE), subopt = sum(suboptimalextracted, na.rm = TRUE), rand = sum(randomizing, na.rm = TRUE), .groups = "drop" ) |> mutate( true_ext_pct = round(true_ext / N * 100, 1), subopt_pct = round(subopt / N * 100, 1), rand_pct = round(rand / N * 100, 1) ) print(tbl_treat) # 10g. Frequency table by treatment × correlationhigh tbl_treat_corr <- extraction_stats |> group_by(treatment, correlationhigh) |> summarise( N = n(), true_ext = sum(trueextracted, na.rm = TRUE), subopt = sum(suboptimalextracted, na.rm = TRUE), rand = sum(randomizing, na.rm = TRUE), .groups = "drop" ) |> mutate( true_ext_pct = round(true_ext / N * 100, 1), subopt_pct = round(subopt / N * 100, 1), rand_pct = round(rand / N * 100, 1), correlationhigh = factor(correlationhigh, levels = c(0, 1), labels = c("Low", "High")) ) |> arrange(treatment, correlationhigh) print(tbl_treat_corr) # 10g (wide). Same table pivoted: one row per treatment, % columns by Low/High tbl_treat_corr_wide <- tbl_treat_corr |> select(treatment, correlationhigh, N, true_ext_pct, subopt_pct, rand_pct) |> pivot_wider( names_from = correlationhigh, values_from = c(N, true_ext_pct, subopt_pct, rand_pct), names_glue = "{.value}_{correlationhigh}" ) |> select(treatment, N_Low, N_High, true_ext_pct_Low, true_ext_pct_High, subopt_pct_Low, subopt_pct_High, rand_pct_Low, rand_pct_High) print(tbl_treat_corr_wide) # ---- 7. Rule_used: posterior table and bar chart --------------------------- rule_obs <- Data_3L |> distinct(subject_id, treatment, correlationhigh, Rule_used) |> mutate( Rule_used = if_else(str_starts(Rule_used, "s_"), "Other", Rule_used), correlationhigh = factor(correlationhigh, levels = c(0, 1), labels = c("Low", "High")) ) # --- 11a. Proportion table (treatment × correlationhigh × Rule_used) --- tbl_rule <- rule_obs |> group_by(treatment, correlationhigh, Rule_used) |> summarise(n = n(), .groups = "drop") |> group_by(treatment, correlationhigh) |> mutate(pct = round(n / sum(n) * 100, 1)) |> ungroup() |> arrange(treatment, correlationhigh, desc(pct)) print(tbl_rule) # Wide version: one row per treatment × Rule_used, columns = Low / High % tbl_rule_wide <- tbl_rule |> select(treatment, correlationhigh, Rule_used, pct) |> pivot_wider(names_from = correlationhigh, values_from = pct, values_fill = 0) |> arrange(treatment, desc(Low + High)) print(tbl_rule_wide) # --- 11b. Bar chart: proportion of Rule_used by treatment × correlationhigh -- rule_plot_data <- tbl_rule |> mutate(treatment = factor(treatment, levels = c("AND", "OR", "INHIBIT", "EITHER", "JOINT"))) p_rule <- ggplot(rule_plot_data, aes(x = reorder(Rule_used, -pct), y = pct, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + scale_fill_manual(values = c("Low" = "#2166ac", "High" = "#d6604d")) + facet_wrap(~ treatment, nrow = 1) + labs( x = "Rule used", y = "% of subjects", fill = NULL ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 7), legend.position = "bottom" ) print(p_rule) # ---- 8. Rule_used: bar chart — rules with ≥ 5% share only ------------------ # Keep only rules that reach 5% in at least one treatment × correlationhigh cell rules_above5 <- tbl_rule |> filter(pct >= 5) |> distinct(Rule_used) rule_plot_data_top <- tbl_rule |> semi_join(rules_above5, by = "Rule_used") |> mutate(treatment = factor(treatment, levels = c("AND", "OR", "INHIBIT", "EITHER", "JOINT"))) p_rule_top <- ggplot(rule_plot_data_top, aes(x = reorder(Rule_used, -pct), y = pct, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + scale_fill_manual(values = c("Low" = "#2166ac", "High" = "#d6604d")) + facet_wrap(~ treatment, nrow = 1, scales = "free_x") + labs( x = "Rule used", y = "% of subjects", fill = NULL ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 7), legend.position = "bottom" ) print(p_rule_top) # ---- 9. Prior shares from structural estimation -------------------------------- # ESTIMATION_RESULTS (from Structural_allrules.R) contains one population-level # share per strategy per treatment — these are the MLE priors, i.e. the mixing # weights before any subject-level data update the posteriors. prior_long <- bind_rows( env_3L1$ESTIMATION_RESULTS |> mutate(session = "3L(1)"), env_3L2$ESTIMATION_RESULTS |> mutate(session = "3L(2)") ) |> select(-loglike, -num_par) |> pivot_longer(c(-treatment, -session), names_to = "Rule", values_to = "share") |> filter(!is.na(share), share > 0) |> mutate( Rule = if_else(str_starts(Rule, "s_"), "Other", Rule), correlationhigh = factor( as.integer( (session == "3L(1)" & treatment %in% c("AND", "INHIBIT")) | (session == "3L(2)" & treatment %in% c("JOINT", "OR", "EITHER")) ), levels = c(0, 1), labels = c("Low", "High") ), treatment = factor(treatment, levels = c("AND", "OR", "INHIBIT", "EITHER", "JOINT")) ) |> group_by(treatment, session, correlationhigh, Rule) |> summarise(share = sum(share), .groups = "drop") |> mutate(pct = round(share * 100, 1)) # --- 9a. Proportion table (treatment × session × Rule, sorted by share) --- tbl_prior <- prior_long |> arrange(treatment, session, desc(pct)) print(tbl_prior) # Wide version: one row per treatment × session × Rule, columns = Low / High % tbl_prior_wide <- tbl_prior |> select(treatment, session, correlationhigh, Rule, pct) |> pivot_wider(names_from = correlationhigh, values_from = pct, values_fill = 0) |> arrange(treatment, session, desc(Low + High)) print(tbl_prior_wide) # --- 9b. Bar chart: all rules with prior share ≥ 5% in at least one treatment --- rules_above5_pr <- prior_long |> filter(pct >= 5) |> distinct(Rule) p_prior_top <- prior_long |> semi_join(rules_above5_pr, by = "Rule") |> ggplot(aes(x = reorder(Rule, -pct), y = pct, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + scale_fill_manual(values = c("Low" = "#2166ac", "High" = "#d6604d")) + facet_wrap(~ treatment, nrow = 1, scales = "free_x") + labs( x = "Rule (prior)", y = "Prior share (%)", fill = NULL ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 7), legend.position = "bottom" ) print(p_prior_top) # ============================================================ # 10. Strategy cost (kappa) estimation # ============================================================ # For each (session x treatment) cell D with k strategies (rho > 0), # set up the k(k-1)/2 pairwise equations: # v^D(pi_i) - v^D(pi_j) = rho^D(pi_i)*kappa(pi_i) - rho^D(pi_j)*kappa(pi_j) # This is the linear system A*kappa = b, rank k-1. # The unique minimum-norm solution (SVD pseudoinverse) requires no # normalisation: it sets sum(kappa_i / rho_i) = 0 implicitly. # # v^D(pi) = val_ from Data_3L (session-specific accuracy). # rho^D(pi) = population share from ESTIMATION_RESULTS. # ============================================================ # ---- 10a. v^D(pi) from val_ columns in Data_3L ------------------------------ # val_ is constant within (session x treatment); take first value per cell. val_long_10 <- Data_3L |> mutate(session = as.character(session)) |> group_by(session, treatment) |> summarise(across(starts_with("val_"), first), .groups = "drop") |> pivot_longer(starts_with("val_"), names_to = "strategy_val", values_to = "v") |> mutate(strategy = str_remove(strategy_val, "^val_")) |> select(session, treatment, strategy, v) cat(sprintf("val_long_10: %d rows | sessions: %s\n", nrow(val_long_10), paste(sort(unique(val_long_10$session)), collapse = ", "))) # ---- 10b. Structural shares rho^D(pi) (both sessions, no always_p) ---------- shares_long_10 <- bind_rows( env_3L1$ESTIMATION_RESULTS |> mutate(session = "3L(1)"), env_3L2$ESTIMATION_RESULTS |> mutate(session = "3L(2)") ) |> select(-loglike, -num_par) |> pivot_longer(-c(treatment, session), names_to = "strategy", values_to = "rho") |> filter(!is.na(rho), rho > 0, strategy != "always_p") cat(sprintf("shares_long_10: %d rows | sessions: %s\n", nrow(shares_long_10), paste(sort(unique(shares_long_10$session)), collapse = ", "))) # ---- 10c. Join v and rho per (session x treatment x strategy) --------------- cost_data_v_10 <- shares_long_10 |> inner_join(val_long_10, by = c("session", "treatment", "strategy")) |> filter(!is.na(v)) cat(sprintf("cost_data_v_10: %d rows | sessions: %s\n", nrow(cost_data_v_10), paste(sort(unique(cost_data_v_10$session)), collapse = ", "))) # ---- 10d. Solver: minimum-norm solution via SVD pseudoinverse --------------- # For k strategies in one D cell, builds the k(k-1)/2 x k design matrix A # and RHS b, then returns kappa = A^+ b. solve_kappa_cell <- function(strats, v_vec, rho_vec) { k <- length(strats) if (k == 1L) return(tibble(strategy = strats, kappa = NA_real_)) pairs <- combn(k, 2) np <- ncol(pairs) A <- matrix(0, np, k) b <- numeric(np) for (p in seq_len(np)) { i <- pairs[1, p]; j <- pairs[2, p] A[p, i] <- rho_vec[i] A[p, j] <- -rho_vec[j] b[p] <- v_vec[i] - v_vec[j] } sv <- svd(A) tol <- max(dim(A)) * max(sv$d) * .Machine$double.eps nz <- sv$d > tol d_inv <- 1 / sv$d[nz] A_plus <- sv$v[, nz, drop = FALSE] %*% (d_inv * t(sv$u[, nz, drop = FALSE])) tibble(strategy = strats, kappa = as.numeric(A_plus %*% b)) } # ---- 10e. Solve per (session x treatment) cell ------------------------------ kappa_solved_10 <- cost_data_v_10 |> group_by(session, treatment) |> summarise( kappa_df = list(solve_kappa_cell(strategy, v, rho)), .groups = "drop" ) |> unnest(kappa_df) cost_data_10 <- cost_data_v_10 |> left_join(kappa_solved_10, by = c("session", "treatment", "strategy")) |> mutate( correlationhigh = factor( as.integer( (session == "3L(1)" & treatment %in% c("AND", "INHIBIT")) | (session == "3L(2)" & treatment %in% c("JOINT", "OR", "EITHER")) ), levels = c(0L, 1L), labels = c("Low", "High") ) ) # ---- 10f. Results table ----------------------------------------------------- cat(sprintf( "\nSection 10 -- %d cells | sessions: %s | %d strategies\n", nrow(cost_data_10), paste(sort(unique(cost_data_10$session)), collapse = ", "), n_distinct(cost_data_10$strategy) )) cat("\n=== Section 10: Strategy cost table (min-norm pairwise system) ===\n") print( cost_data_10 |> select(session, treatment, correlationhigh, strategy, rho, v, kappa) |> arrange(treatment, session, desc(rho)) |> mutate(across(c(rho, v, kappa), ~ round(.x, 4))) ) # ---- 10g. Scatter: estimated kappa grouped by strategy family ---- # Strategies are pooled into logical-operator families so that, e.g., # AND_rb / AND_rg / AND_bg / AND3 all share the same x position. # The red dot + error bar shows mean ± SE across all treatment × session # observations within the family (after outlier removal). scatter_data_10g <- cost_data_10 |> filter(!is.na(kappa)) |> mutate( strategy_group = case_when( str_starts(strategy, "s_") ~ "Other", strategy == "random" ~ "Random", strategy %in% c("always_0", "always_1", "red", "not_red", "blue", "not_blue", "green", "not_green") ~ "1-variable", str_starts(strategy, "not_INHIBIT") ~ "not-INHIBIT", str_detect(strategy, "^AND") ~ "AND", str_detect(strategy, "^OR") ~ "OR", str_detect(strategy, "^EITHER") ~ "EITHER", str_detect(strategy, "^JOINT") ~ "JOINT", str_detect(strategy, "^INHIBIT") ~ "INHIBIT", str_detect(strategy, "^NOR") ~ "NOR", str_detect(strategy, "^NAND") ~ "NAND", TRUE ~ strategy ) ) |> group_by(strategy_group) |> filter({ iqr <- IQR(kappa) med <- median(kappa) kappa >= med - 1.5 * iqr & kappa <= med + 1.5 * iqr }) |> ungroup() scatter_summary_10g <- scatter_data_10g |> group_by(strategy_group) |> summarise( mean_kappa = mean(kappa), se_kappa = sd(kappa) / sqrt(n()), .groups = "drop" ) ggplot(scatter_data_10g, aes(x = strategy_group, y = kappa)) + geom_hline(yintercept = 0, linetype = "dashed", colour = "grey60") + geom_jitter(aes(colour = treatment, shape = session), width = 0.15, height = 0, size = 2.5, alpha = 0.85) + geom_point(data = scatter_summary_10g, aes(x = strategy_group, y = mean_kappa), colour = "red", size = 3.5, inherit.aes = FALSE) + geom_errorbar(data = scatter_summary_10g, aes(x = strategy_group, ymin = mean_kappa - se_kappa, ymax = mean_kappa + se_kappa), colour = "red", width = 0.25, linewidth = 0.9, inherit.aes = FALSE) + scale_colour_brewer(palette = "Set1") + labs( x = "Strategy family", y = expression(hat(kappa)), colour = "Treatment", shape = "Session", title = "Estimated strategy cost by strategy family" ) + theme_bw() + theme(axis.text.x = element_text(angle = 40, hjust = 1))