# ============================================================
# Pilot Analysis: Shaded Lights Experiment — Sessions 2L(1)–2L(4)
# ============================================================
# Sources  Structural_modelselection_2lights1.R → session "2L(1)"
#          Structural_modelselection_2lights.R  → session "2L(2)"
#          Structural_modelselection_2lights3.R → session "2L(3)"
#          Structural_modelselection_2lights4.R → session "2L(4)"
# Stacks them vertically with continuous subject numbering.
# AND treatment excluded from sessions 2L(2) and 2L(4) by default;
# set INCLUDE_AND_2L24 <- TRUE (below) to include them.
# Output: Data_2L
# ============================================================
#
# ------------------------------------------------------------
# DATA_2L — VARIABLE CODEBOOK
# ------------------------------------------------------------
#
# UNIT OF OBSERVATION
# One row per subject × treatment × guess (trial).
# Each subject makes 12 guesses per treatment across 5 treatments
# → 60 rows per subject (72 if INCLUDE_AND_2L24 = TRUE, adding AND
# from 2L(2) [→ Easy (LA)] and 2L(4) [→ Difficult]).
# Sessions are stacked vertically.
#
# ── IDENTIFIERS ─────────────────────────────────────────────
# subject_id    : Integer. Re-numbered continuously across all
#                 sessions: 2L(1) IDs unchanged; 2L(2) offset by
#                 N_2L(1); 2L(3) offset by N_2L(1) + N_2L(2);
#                 2L(4) offset by N_2L(1) + N_2L(2) + N_2L(3).
# prolific_id   : Character. Anonymised Prolific ID (24-char hex).
# session       : Factor. "2L(1)", "2L(2)", "2L(3)", "2L(4)".
#
# ── TREATMENT ───────────────────────────────────────────────
# treatment     : Character. The logical rule for this treatment.
#                 Eight possible values across sessions:
#                 AND             – sound iff red ON AND blue ON
#                                   [all sessions; 2L(2)/2L(4) only
#                                    included if INCLUDE_AND_2L24=TRUE]
#                 OR              – sound iff red ON OR blue ON
#                 INHIBIT         – sound iff red ON AND blue OFF
#                                   [sessions 2L(1) and 2L(2) only]
#                 EITHER          – sound iff exactly one light ON
#                 JOINT           – sound iff both lights share state
#                                   [sessions 2L(1) and 2L(2) only]
#                 ALONE_easy      – sound iff red ON (any blue)
#                                   [session 2L(3) only]
#                 ALONE_difficult – sound iff blue ON (any red)
#                                   [session 2L(3) only]
#                 ALONE           – sound iff red ON (any blue)
#                                   [session 2L(4) only]
# round_order   : Integer (1–5). Position in which the subject
#                 encountered this treatment.
#
# ── TRIAL-LEVEL VARIABLES ───────────────────────────────────
# Guess_Number     : Integer (1–12).
# Light_Config     : Character. "(red,blue)" with 1 = ON, 0 = OFF.
# red_light        : Integer (0/1).
# blue_light       : Integer (0/1).
# Guess            : Integer (0/1). Subject's prediction.
# Machine_CorrectP : Integer (0/1). Optimal prediction given the rule.
#
# ── TREATMENT-LEVEL VARIABLES ───────────────────────────────
# time_machine, Notes, certainty, difficulty, difficulty_certainty,
# predicted_correct_self, max_correct, trueextracted (see DOfile)
#
# ── SUBJECT-LEVEL VARIABLES ─────────────────────────────────
# STRATEGY, COMMENTS, num_wrong, final_payment, payment_machine,
# education, age, time_total_min, original_color (see DOfile)
#
# ── RULE-VALUE VARIABLES ────────────────────────────────────
# val_<rule>    : Proportion of trials in this treatment's dataset
#                 that the given rule predicts correctly (one column
#                 for each of the 16 deterministic rules).
#
# ── PER-CONFIG ACCURACY OF THE TRUE RULE ────────────────────
# ACC_00, ACC_01, ACC_10, ACC_11
#
# ── LIGHT-CONFIG FREQUENCIES ────────────────────────────────
# Frequency_00, Frequency_01, Frequency_10, Frequency_11
#
# ── STRUCTURAL POSTERIORS ───────────────────────────────────
# post_<rule>   : Posterior probability of each strategy.
# Rule_used     : Strategy with highest posterior.
# posterior     : Posterior of Rule_used (2 d.p.).
#
# ── ANALYSIS VARIABLES ──────────────────────────────────────
# correlationhigh : Integer with 3 values (see section 5 for the
#                   exact assignment):
#                   0 – Difficult:
#                       2L(1) AND, EITHER | 2L(2) OR, INHIBIT, JOINT
#                       | 2L(3) ALONE_difficult | 2L(4) AND
#                   1 – Easy (LA):
#                       2L(1) OR | 2L(2) AND, EITHER
#                       | 2L(4) all except AND and EITHER
#                   2 – Easy (HA):
#                       2L(1) INHIBIT, JOINT | 2L(4) EITHER
#                       | 2L(3) all except ALONE_difficult
#                       (new frequency distributions)
# ------------------------------------------------------------

library(tidyverse)
library(readr)

# ============================================================
# ANALYSIS OPTIONS
# ============================================================
# TRUE  → include AND rows from 2L(2) [→ Easy (LA)] and 2L(4) [→ Difficult]
# FALSE → exclude them (original behaviour)
INCLUDE_AND_2L24 <- FALSE
# ============================================================

# ---- 1. Source all four structural estimation scripts --------------------
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)

env_2L1 <- new.env(parent = globalenv())
source(file.path(script_dir, "Structural_modelselection_2lights1.R"), local = env_2L1)
env_2L2 <- new.env(parent = globalenv())
source(file.path(script_dir, "Structural_modelselection_2lights.R"), local = env_2L2)
env_2L3 <- new.env(parent = globalenv())
source(file.path(script_dir, "Structural_modelselection_2lights3.R"), local = env_2L3)
env_2L4 <- new.env(parent = globalenv())
source(file.path(script_dir, "Structural_modelselection_2lights4.R"), local = env_2L4)

df_2L1 <- env_2L1$df_final_post # session 1
df_2L2 <- env_2L2$df_final_post # session 2
df_2L3 <- env_2L3$df_final_post # session 3
df_2L4 <- env_2L4$df_final_post # session 4

# ---- 2. Add session identifier
df_2L1 <- df_2L1 %>% mutate(session = "2L(1)")
df_2L2 <- df_2L2 %>% mutate(session = "2L(2)")
df_2L3 <- df_2L3 %>% mutate(session = "2L(3)")
df_2L4 <- df_2L4 %>% mutate(session = "2L(4)")

# ---- 3. Continue subject numbering across all four sessions
n_subjects_2L1 <- n_distinct(df_2L1$subject_id)
n_subjects_2L2 <- n_distinct(df_2L2$subject_id)
n_subjects_2L3 <- n_distinct(df_2L3$subject_id)

df_2L2 <- df_2L2 %>% mutate(subject_id = subject_id + n_subjects_2L1)
df_2L3 <- df_2L3 %>% mutate(subject_id = subject_id + n_subjects_2L1 + n_subjects_2L2)
df_2L4 <- df_2L4 %>% mutate(subject_id = subject_id + n_subjects_2L1 + n_subjects_2L2 + n_subjects_2L3)

# ---- 4. Stack vertically → Data_2L, optionally removing AND from 2L(2)/2L(4)
Data_2L <- bind_rows(df_2L1, df_2L2, df_2L3, df_2L4) %>%
  {
    if (!INCLUDE_AND_2L24)
      filter(., !(treatment == "AND" & session %in% c("2L(2)", "2L(4)")))
    else .
  } %>%
  mutate(session = factor(session, levels = c("2L(1)", "2L(2)", "2L(3)", "2L(4)"))) %>%
  arrange(session, subject_id, round_order, Guess_Number)
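# Optional sanity check (not part of the original pipeline): after the
# offsets above, no subject_id should appear in more than one session.
id_overlap <- Data_2L %>%
  distinct(subject_id, session) %>%
  count(subject_id) %>%
  filter(n > 1)
stopifnot(nrow(id_overlap) == 0)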
# ---- 5. Correlationhigh variable
# 0 = Difficult   (sessions 2L(1), 2L(2) and 2L(4), plus ALONE_difficult in 2L(3))
# 1 = Easy (LA)   (sessions 2L(1), 2L(2) and 2L(4))
# 2 = Easy (HA)   (INHIBIT/JOINT in 2L(1), EITHER in 2L(4), and all
#                  session-2L(3) treatments except ALONE_difficult)
Data_2L <- Data_2L %>%
  mutate(correlationhigh = case_when(
    treatment == "ALONE_difficult" & session == "2L(3)" ~ 0L,
    session == "2L(3)"                                  ~ 2L,
    treatment == "AND"     & session == "2L(2)" ~ 1L,
    treatment == "AND"     & session == "2L(1)" ~ 0L,
    treatment == "EITHER"  & session == "2L(2)" ~ 1L,
    treatment == "OR"      & session == "2L(1)" ~ 1L,
    treatment == "INHIBIT" & session == "2L(1)" ~ 2L,
    treatment == "JOINT"   & session == "2L(1)" ~ 2L,
    treatment == "AND"     & session == "2L(4)" ~ 0L,
    treatment == "EITHER"  & session == "2L(4)" ~ 2L,
    session == "2L(4)"                          ~ 1L,
    TRUE                                        ~ 0L
  ))

# ---- Shared labels and colours (used throughout)
corr_labels  <- c("0" = "Difficult", "1" = "Easy (LA)", "2" = "Easy (HA)")
corr_colours <- c("0" = "#2166ac",  # blue
                  "1" = "#d6604d",  # red
                  "2" = "#4dac26")  # green
corr_fills   <- c("0" = "#2166ac", "1" = "#d6604d", "2" = "#4dac26")

# Ordered factor label map also used in tables
corr_levels_named <- c("Difficult", "Easy (LA)", "Easy (HA)")

# For plots only: collapse ALONE_easy and ALONE_difficult into one "ALONE" panel
to_plot_treatment <- function(x) {
  if_else(x %in% c("ALONE_easy", "ALONE_difficult"), "ALONE", as.character(x))
}
plot_treatment_levels <- c("AND", "OR", "INHIBIT", "EITHER", "JOINT", "ALONE")

# ---- 6. Subject_acc: guess-weighted accuracy by light-config frequency
# A guess that matches the machine-optimal prediction is correct with the
# true rule's per-config accuracy (e.g. 0.8 under ACC_11 = 0.8); a deviating
# guess is correct with the complementary probability (0.2).
subject_stats <- Data_2L %>%
  mutate(
    acc_val = case_when(
      Light_Config == "(0,0)" ~ ACC_00,
      Light_Config == "(0,1)" ~ ACC_01,
      Light_Config == "(1,0)" ~ ACC_10,
      Light_Config == "(1,1)" ~ ACC_11
    ),
    guess_acc = if_else(Guess == Machine_CorrectP, acc_val, 1 - acc_val)
  ) %>%
  group_by(subject_id, treatment, correlationhigh) %>%
  summarise(
    Subject_acc = mean(guess_acc, na.rm = TRUE),
    max_correct = first(max_correct),
    .groups = "drop"
  )

# ---- 2. CDF plots by correlationhigh: Aggregate (not really helpful: substitution) ----
p_max <- subject_stats %>%
  mutate(correlationhigh = factor(correlationhigh)) %>%
  ggplot(aes(x = max_correct, colour = correlationhigh)) +
  stat_ecdf(linewidth = 0.8) +
  scale_colour_manual(values = corr_colours, labels = corr_labels) +
  labs(
    x = "Number of correct guesses (max_correct)",
    y = "Cumulative proportion",
    colour = NULL
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
print(p_max)

# ---- 3. CDF plots by correlationhigh: By treatment (not really helpful: substitution) ----
p_max_bytreat <- subject_stats %>%
  mutate(correlationhigh = factor(correlationhigh),
         treatment_plot  = factor(to_plot_treatment(treatment),
                                  levels = plot_treatment_levels)) %>%
  ggplot(aes(x = max_correct, colour = correlationhigh)) +
  stat_ecdf(linewidth = 0.8) +
  scale_colour_manual(values = corr_colours, labels = corr_labels) +
  facet_wrap(~ treatment_plot, nrow = 1) +
  labs(
    x = "Number of correct guesses (max_correct)",
    y = "Cumulative proportion",
    colour = NULL
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
print(p_max_bytreat)

# ---- 4.
Bar plots: fraction of subjects per max_correct bin (IMPORTANT) --------------- bar_data <- subject_stats %>% mutate(correlationhigh = factor(correlationhigh)) %>% group_by(correlationhigh, max_correct) %>% summarise(n = n(), .groups = "drop") %>% group_by(correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() means_data <- subject_stats %>% mutate(correlationhigh = factor(correlationhigh)) %>% group_by(correlationhigh) %>% summarise(mean_mc = mean(max_correct, na.rm = TRUE), .groups = "drop") p_bar <- ggplot(bar_data, aes(x = max_correct, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", width = 0.7, alpha = 0.85) + geom_vline(data = means_data, aes(xintercept = mean_mc, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = corr_fills, labels = corr_labels) + scale_colour_manual(values = corr_colours, labels = corr_labels, guide = "none") + scale_x_continuous(breaks = 0:12) + labs( x = "Number of correct guesses (max_correct)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "bottom") print(p_bar) # By treatment bar_data_treat <- subject_stats %>% mutate(correlationhigh = factor(correlationhigh), treatment_plot = factor(to_plot_treatment(treatment), levels = plot_treatment_levels)) %>% group_by(treatment_plot, correlationhigh, max_correct) %>% summarise(n = n(), .groups = "drop") %>% group_by(treatment_plot, correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() means_data_treat <- subject_stats %>% mutate(correlationhigh = factor(correlationhigh), treatment_plot = factor(to_plot_treatment(treatment), levels = plot_treatment_levels)) %>% group_by(treatment_plot, correlationhigh) %>% summarise(mean_mc = mean(max_correct, na.rm = TRUE), .groups = "drop") p_bar_treat <- ggplot(bar_data_treat, aes(x = max_correct, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", width = 0.8, alpha = 0.85) + geom_vline(data = means_data_treat, aes(xintercept = mean_mc, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = corr_fills, labels = corr_labels) + scale_colour_manual(values = corr_colours, labels = corr_labels, guide = "none") + scale_x_continuous(breaks = seq(0, 12, 3)) + facet_grid(correlationhigh ~ treatment_plot, labeller = labeller(correlationhigh = corr_labels)) + labs( x = "Number of correct guesses (max_correct)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "none") print(p_bar_treat) # ---- 5. 
Bar plots: trueextracted by correlationhigh (IMPORTANT) ---------------------- te_stats <- Data_2L %>% distinct(subject_id, treatment, correlationhigh, trueextracted) bar_data_te <- te_stats %>% mutate(correlationhigh = factor(correlationhigh)) %>% group_by(correlationhigh, trueextracted) %>% summarise(n = n(), .groups = "drop") %>% group_by(correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() means_data_te <- te_stats %>% mutate(correlationhigh = factor(correlationhigh)) %>% group_by(correlationhigh) %>% summarise(mean_te = mean(trueextracted, na.rm = TRUE), .groups = "drop") p_bar_te <- ggplot(bar_data_te, aes(x = trueextracted, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", width = 0.5, alpha = 0.85) + geom_vline(data = means_data_te, aes(xintercept = mean_te, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = corr_fills, labels = corr_labels) + scale_colour_manual(values = corr_colours, labels = corr_labels, guide = "none") + scale_x_continuous(breaks = 0:1, labels = c("Not extracted", "Extracted")) + labs( x = "True rule extracted (trueextracted)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "bottom") print(p_bar_te) # By treatment bar_data_te_treat <- te_stats %>% mutate(correlationhigh = factor(correlationhigh), treatment_plot = factor(to_plot_treatment(treatment), levels = plot_treatment_levels)) %>% group_by(treatment_plot, correlationhigh, trueextracted) %>% summarise(n = n(), .groups = "drop") %>% group_by(treatment_plot, correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() means_data_te_treat <- te_stats %>% mutate(correlationhigh = factor(correlationhigh), treatment_plot = factor(to_plot_treatment(treatment), levels = plot_treatment_levels)) %>% group_by(treatment_plot, correlationhigh) %>% summarise(mean_te = mean(trueextracted, na.rm = TRUE), .groups = "drop") p_bar_te_treat <- ggplot(bar_data_te_treat, aes(x = trueextracted, y = frac, fill = correlationhigh)) + geom_bar(stat = "identity", width = 0.5, alpha = 0.85) + geom_vline(data = means_data_te_treat, aes(xintercept = mean_te, colour = correlationhigh), linewidth = 0.9, linetype = "dashed") + scale_fill_manual(values = corr_fills, labels = corr_labels) + scale_colour_manual(values = corr_colours, labels = corr_labels, guide = "none") + scale_x_continuous(breaks = 0:1, labels = c("Not extracted", "Extracted")) + facet_grid(correlationhigh ~ treatment_plot, labeller = labeller(correlationhigh = corr_labels)) + labs( x = "True rule extracted (trueextracted)", y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme(legend.position = "none") print(p_bar_te_treat) # ---- 6. 
Suboptimal extraction and randomizing (IMPORTANT) ----------------------------
rule_table <- expand.grid(p00 = 0:1, p01 = 0:1, p10 = 0:1, p11 = 0:1) %>%
  as_tibble() %>%
  mutate(rule_id = row_number()) %>%
  pivot_longer(-rule_id, names_to = "config_key", values_to = "rule_pred") %>%
  mutate(Light_Config = case_when(
    config_key == "p00" ~ "(0,0)",
    config_key == "p01" ~ "(0,1)",
    config_key == "p10" ~ "(1,0)",
    config_key == "p11" ~ "(1,1)"
  )) %>%
  select(-config_key)

true_rule_ids <- Data_2L %>%
  distinct(treatment, Light_Config, Machine_CorrectP) %>%
  left_join(rule_table, by = "Light_Config", relationship = "many-to-many") %>%
  group_by(treatment, rule_id) %>%
  summarise(is_true = all(Machine_CorrectP == rule_pred), .groups = "drop") %>%
  filter(is_true) %>%
  select(treatment, true_rule_id = rule_id)

# A subject "extracts" a suboptimal rule if their 12 guesses match some
# non-true deterministic rule more often than chance: two-sided exact
# binomial p <= .01 with n_match > 6, which with 12 trials jointly
# requires n_match >= 11.
suboptimalextracted_df <- Data_2L %>%
  select(subject_id, treatment, correlationhigh, Light_Config, Guess) %>%
  left_join(rule_table, by = "Light_Config", relationship = "many-to-many") %>%
  left_join(true_rule_ids, by = "treatment") %>%
  filter(rule_id != true_rule_id) %>%
  group_by(subject_id, treatment, correlationhigh, rule_id) %>%
  summarise(n_match = sum(Guess == rule_pred), .groups = "drop") %>%
  rowwise() %>%
  mutate(passes = binom.test(n_match, 12, 0.5)$p.value <= 0.01 & n_match > 6) %>%
  ungroup() %>%
  group_by(subject_id, treatment, correlationhigh) %>%
  summarise(suboptimalextracted = as.integer(any(passes)), .groups = "drop")

extraction_stats <- Data_2L %>%
  distinct(subject_id, treatment, correlationhigh, trueextracted) %>%
  left_join(suboptimalextracted_df,
            by = c("subject_id", "treatment", "correlationhigh")) %>%
  mutate(randomizing = as.integer(trueextracted == 0 & suboptimalextracted == 0))

# By correlationhigh
tbl_corr <- extraction_stats %>%
  group_by(correlationhigh) %>%
  summarise(
    N        = n(),
    true_ext = sum(trueextracted, na.rm = TRUE),
    subopt   = sum(suboptimalextracted, na.rm = TRUE),
    rand     = sum(randomizing, na.rm = TRUE),
    .groups  = "drop"
  ) %>%
  mutate(
    true_ext_pct = round(true_ext / N * 100, 1),
    subopt_pct   = round(subopt / N * 100, 1),
    rand_pct     = round(rand / N * 100, 1)
  )
print(tbl_corr)

# By treatment
tbl_treat <- extraction_stats %>%
  group_by(treatment) %>%
  summarise(
    N        = n(),
    true_ext = sum(trueextracted, na.rm = TRUE),
    subopt   = sum(suboptimalextracted, na.rm = TRUE),
    rand     = sum(randomizing, na.rm = TRUE),
    .groups  = "drop"
  ) %>%
  mutate(
    true_ext_pct = round(true_ext / N * 100, 1),
    subopt_pct   = round(subopt / N * 100, 1),
    rand_pct     = round(rand / N * 100, 1)
  )
print(tbl_treat)

# By treatment × correlationhigh
tbl_treat_corr <- extraction_stats %>%
  group_by(treatment, correlationhigh) %>%
  summarise(
    N        = n(),
    true_ext = sum(trueextracted, na.rm = TRUE),
    subopt   = sum(suboptimalextracted, na.rm = TRUE),
    rand     = sum(randomizing, na.rm = TRUE),
    .groups  = "drop"
  ) %>%
  mutate(
    true_ext_pct = round(true_ext / N * 100, 1),
    subopt_pct   = round(subopt / N * 100, 1),
    rand_pct     = round(rand / N * 100, 1),
    correlationhigh = factor(correlationhigh, levels = c(0, 1, 2),
                             labels = c("Difficult", "Easy (LA)", "Easy (HA)"))
  ) %>%
  arrange(treatment, correlationhigh)
print(tbl_treat_corr)

# Wide version
tbl_treat_corr_wide <- tbl_treat_corr %>%
  select(treatment, correlationhigh, N, true_ext_pct, subopt_pct, rand_pct) %>%
  pivot_wider(
    names_from  = correlationhigh,
    values_from = c(N, true_ext_pct, subopt_pct, rand_pct),
    names_glue  = "{.value}_{correlationhigh}"
  )
print(tbl_treat_corr_wide)

# ---- 6a.
Extraction categories: bar charts (true_extracted / suboptimal / randomizing) ---- # Same visual logic as 7b/7c but using the directly-computed extraction variables # from section 6 rather than the structural Rule_used posteriors. # Priority (mutually exclusive): True Extracted > Suboptimal > Randomizing. ext_cat_obs <- extraction_stats %>% mutate( correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)")), category = factor( case_when( trueextracted == 1 ~ "True Extracted", suboptimalextracted == 1 ~ "Suboptimal", TRUE ~ "Randomizing" ), levels = c("True Extracted", "Suboptimal", "Randomizing") ) ) # — 6a-i. By treatment × correlationhigh (mirror of p_rule_cat) — ext_cat_plot_data <- ext_cat_obs %>% mutate(treatment_plot = factor(to_plot_treatment(as.character(treatment)), levels = plot_treatment_levels)) %>% group_by(treatment_plot, correlationhigh, category) %>% summarise(n = n(), .groups = "drop") %>% group_by(treatment_plot, correlationhigh) %>% mutate(pct = round(n / sum(n) * 100, 1)) %>% ungroup() p_ext_cat <- ggplot(ext_cat_plot_data, aes(x = category, y = pct, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + scale_fill_manual(values = c("Difficult" = "#2166ac", "Easy (LA)" = "#d6604d", "Easy (HA)" = "#4dac26")) + facet_wrap(~ treatment_plot, nrow = 1) + labs( x = "Category", y = "% of subjects", fill = NULL ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 8), legend.position = "bottom" ) print(p_ext_cat) # — 6a-ii. All treatments pooled (mirror of p_rule_cat_pooled) — # Best- and second-best rule accuracy per condition (same computation as in 7c) best_rule_acc_cond_6a <- Data_2L %>% mutate(session = as.character(session)) %>% group_by(session, treatment, correlationhigh) %>% summarise(across(starts_with("val_"), first), .groups = "drop") %>% rowwise() %>% mutate( sorted = list(sort(c_across(starts_with("val_")), decreasing = TRUE)), best_val = sorted[[1]], second_best_val = sorted[[2]], gap = best_val - second_best_val ) %>% ungroup() %>% select(-sorted) %>% mutate(correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) %>% group_by(correlationhigh) %>% summarise(avg_best_val = mean(best_val, na.rm = TRUE), avg_gap = mean(gap, na.rm = TRUE), .groups = "drop") ext_cat_pooled_data <- ext_cat_obs %>% group_by(correlationhigh, category) %>% summarise(n = n(), .groups = "drop") %>% group_by(correlationhigh) %>% mutate(pct = round(n / sum(n) * 100, 1)) %>% ungroup() %>% left_join(best_rule_acc_cond_6a, by = "correlationhigh") %>% mutate(bar_label = case_when( category == "True Extracted" ~ sprintf("%.0f%%", avg_best_val * 100), category == "Suboptimal" ~ sprintf("\u0394%.0fpp", avg_gap * 100), category == "Randomizing" ~ sprintf("\u0394%.0fpp", (avg_best_val - 0.5) * 100), TRUE ~ NA_character_ )) p_ext_cat_pooled <- ggplot(ext_cat_pooled_data, aes(x = category, y = pct, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + geom_text( aes(label = bar_label), position = position_dodge(width = 0.9), vjust = -0.4, size = 2.8, na.rm = TRUE ) + scale_fill_manual(values = c("Difficult" = "#2166ac", "Easy (LA)" = "#d6604d", "Easy (HA)" = "#4dac26")) + scale_y_continuous(expand = expansion(mult = c(0, 0.12))) + labs( x = "Category", y = "Share of subjects", fill = NULL, title = "" ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 8), 
legend.position = "bottom" ) print(p_ext_cat_pooled) # ---- 7. Rule_used: posterior table and bar chart (IMPORTANT) ------------------------- RULE_POST_THRESHOLD <- 0.0 # set > 0 to keep only subjects with posterior >= threshold rule_obs <- Data_2L %>% filter(is.na(posterior) | posterior >= RULE_POST_THRESHOLD) %>% distinct(subject_id, treatment, correlationhigh, Rule_used) %>% mutate(correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) # Proportion table tbl_rule <- rule_obs %>% group_by(treatment, correlationhigh, Rule_used) %>% summarise(n = n(), .groups = "drop") %>% group_by(treatment, correlationhigh) %>% mutate(pct = round(n / sum(n) * 100, 1)) %>% ungroup() %>% arrange(treatment, correlationhigh, desc(pct)) print(tbl_rule) # Wide tbl_rule_wide <- tbl_rule %>% select(treatment, correlationhigh, Rule_used, pct) %>% pivot_wider(names_from = correlationhigh, values_from = pct, values_fill = 0) %>% arrange(treatment, desc(Difficult + `Easy (LA)` + `Easy (HA)`)) print(tbl_rule_wide) # Bar chart all_treatments_ordered <- c("AND", "OR", "INHIBIT", "EITHER", "JOINT", "ALONE_easy", "ALONE_difficult") rule_plot_data <- tbl_rule %>% mutate(treatment_plot = factor(to_plot_treatment(as.character(treatment)), levels = plot_treatment_levels)) %>% group_by(treatment_plot, correlationhigh, Rule_used) %>% summarise(n = sum(n), .groups = "drop") %>% group_by(treatment_plot, correlationhigh) %>% mutate(pct = round(n / sum(n) * 100, 1)) %>% ungroup() p_rule <- ggplot(rule_plot_data, aes(x = reorder(Rule_used, -pct), y = pct, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + scale_fill_manual(values = c("Difficult" = "#2166ac", "Easy (LA)" = "#d6604d", "Easy (HA)" = "#4dac26")) + facet_wrap(~ treatment_plot, nrow = 1) + labs( x = "Rule used", y = "% of subjects", fill = NULL ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 7), legend.position = "bottom" ) print(p_rule) # ---- 7b. 
Rule_used: collapsed 3-category bar chart
# Categories:
#   True Extracted – Rule_used matches the treatment's true rule
#                    (ALONE_easy → "red", ALONE_difficult → "blue")
#   Random         – Rule_used is "random" or "always_p"
#   Other Rule     – everything else
true_rule_map <- c(
  AND             = "AND",
  OR              = "OR",
  INHIBIT         = "INHIBIT",
  EITHER          = "EITHER",
  JOINT           = "JOINT",
  ALONE_easy      = "red",
  ALONE_difficult = "blue",
  ALONE           = "red" # session 2L(4): sound iff red ON
)

rule_cat_obs <- rule_obs %>%
  mutate(
    true_rule = true_rule_map[as.character(treatment)],
    category = case_when(
      Rule_used %in% c("random", "always_p") ~ "Random",
      Rule_used == true_rule                 ~ "True Extracted",
      TRUE                                   ~ "Other Rule"
    ),
    category = factor(category, levels = c("True Extracted", "Other Rule", "Random"))
  )

rule_cat_plot_data <- rule_cat_obs %>%
  mutate(treatment_plot = factor(to_plot_treatment(as.character(treatment)),
                                 levels = plot_treatment_levels)) %>%
  group_by(treatment_plot, correlationhigh, category) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(treatment_plot, correlationhigh) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  ungroup()

p_rule_cat <- ggplot(rule_cat_plot_data,
                     aes(x = category, y = pct, fill = correlationhigh)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  scale_fill_manual(values = c("Difficult" = "#2166ac",
                               "Easy (LA)" = "#d6604d",
                               "Easy (HA)" = "#4dac26")) +
  facet_wrap(~ treatment_plot, nrow = 1) +
  labs(
    x = "Category",
    y = "% of subjects",
    fill = NULL
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    legend.position = "bottom"
  )
print(p_rule_cat)

# ---- 7c. Rule_used: collapsed 3-category, all treatments pooled
# Best- and second-best rule accuracy per (session × treatment), then gap averaged
# within each condition. val_ columns are constant within a cell.
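# Toy illustration with hypothetical val_ numbers (not experiment data) of
# the best / second-best computation done next: for (0.90, 0.75, 0.50) the
# best rule scores 0.90 and the gap to the runner-up is 0.15.
tibble(val_a = 0.90, val_b = 0.75, val_c = 0.50) %>%
  rowwise() %>%
  mutate(
    best = sort(c_across(starts_with("val_")), decreasing = TRUE)[1],
    gap  = best - sort(c_across(starts_with("val_")), decreasing = TRUE)[2]
  ) %>%
  ungroup()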
best_rule_acc_cond <- Data_2L %>% mutate(session = as.character(session)) %>% group_by(session, treatment, correlationhigh) %>% summarise(across(starts_with("val_"), first), .groups = "drop") %>% rowwise() %>% mutate( sorted = list(sort(c_across(starts_with("val_")), decreasing = TRUE)), best_val = sorted[[1]], second_best_val = sorted[[2]], gap = best_val - second_best_val ) %>% ungroup() %>% select(-sorted) %>% mutate(correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) %>% group_by(correlationhigh) %>% summarise(avg_best_val = mean(best_val, na.rm = TRUE), avg_gap = mean(gap, na.rm = TRUE), .groups = "drop") rule_cat_pooled_data <- rule_cat_obs %>% group_by(correlationhigh, category) %>% summarise(n = n(), .groups = "drop") %>% group_by(correlationhigh) %>% mutate(pct = round(n / sum(n) * 100, 1)) %>% ungroup() %>% left_join(best_rule_acc_cond, by = "correlationhigh") %>% mutate(bar_label = case_when( category == "True Extracted" ~ sprintf("%.0f%%", avg_best_val * 100), category == "Other Rule" ~ sprintf("\u0394%.0fpp", avg_gap * 100), category == "Random" ~ sprintf("\u0394%.0fpp", (avg_best_val - 0.5) * 100), TRUE ~ NA_character_ )) p_rule_cat_pooled <- ggplot(rule_cat_pooled_data, aes(x = category, y = pct, fill = correlationhigh)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + geom_text( aes(label = bar_label), position = position_dodge(width = 0.9), vjust = -0.4, size = 2.8, na.rm = TRUE ) + scale_fill_manual(values = c("Difficult" = "#2166ac", "Easy (LA)" = "#d6604d", "Easy (HA)" = "#4dac26")) + scale_y_continuous(expand = expansion(mult = c(0, 0.12))) + labs( x = "Category", y = "Share of subjects", fill = NULL, title = "" ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 8), legend.position = "bottom" ) print(p_rule_cat_pooled) # ---- 7d. Rule_used: collapsed 3-category, all treatments pooled — by education level # Join subject-level education into rule_cat_obs (one row per subject × treatment) rule_cat_edu <- rule_cat_obs %>% left_join( Data_2L %>% distinct(subject_id, education), by = "subject_id" ) %>% mutate( education = factor(education, levels = c( "High school diploma/A-levels", "Technical/community college", "Undergraduate degree (BA/BSc/other)", "Graduate degree (MA/MSc/MPhil/other)", "Doctorate degree (PhD/other)" )) ) %>% filter(!is.na(education)) # drops CONSENT_REVOKED and any unmatched values rule_cat_edu_data <- rule_cat_edu %>% group_by(correlationhigh, education, category) %>% summarise(n = n(), .groups = "drop") %>% group_by(correlationhigh, education) %>% mutate(pct = round(n / sum(n) * 100, 1)) %>% ungroup() for (cond in levels(rule_cat_edu_data$correlationhigh)) { p <- rule_cat_edu_data %>% filter(correlationhigh == cond) %>% ggplot(aes(x = category, y = pct, fill = education)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.85) + geom_text( aes(label = paste0(pct, "%")), position = position_dodge(width = 0.9), vjust = -0.4, size = 2.8 ) + scale_y_continuous(expand = expansion(mult = c(0, 0.12))) + labs( x = "Category", y = "Share of subjects", fill = "Education level", title = cond ) + theme_bw() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = 8), legend.position = "bottom", legend.text = element_text(size = 7) ) print(p) } # ---- 8. 
Prior shares from structural estimation (USE POSTERIORS) --------------------------
prior_long_2L <- bind_rows(
  env_2L1$ESTIMATION_RESULTS %>% mutate(session = "2L(1)"),
  env_2L2$ESTIMATION_RESULTS %>% mutate(session = "2L(2)"),
  env_2L3$ESTIMATION_RESULTS %>% mutate(session = "2L(3)"),
  env_2L4$ESTIMATION_RESULTS %>% mutate(session = "2L(4)")
) %>%
  {
    if (!INCLUDE_AND_2L24)
      filter(., !(treatment == "AND" & session %in% c("2L(2)", "2L(4)")))
    else .
  } %>%
  select(-loglike, -num_par) %>%
  pivot_longer(c(-treatment, -session), names_to = "Rule", values_to = "share") %>%
  filter(!is.na(share), share > 0) %>%
  mutate(
    # Same assignment as section 5
    correlationhigh = factor(
      case_when(
        treatment == "ALONE_difficult" & session == "2L(3)" ~ 0L,
        session == "2L(3)"                                  ~ 2L,
        treatment == "AND"     & session == "2L(2)" ~ 1L,
        treatment == "AND"     & session == "2L(1)" ~ 0L,
        treatment == "EITHER"  & session == "2L(2)" ~ 1L,
        treatment == "OR"      & session == "2L(1)" ~ 1L,
        treatment == "INHIBIT" & session == "2L(1)" ~ 2L,
        treatment == "JOINT"   & session == "2L(1)" ~ 2L,
        treatment == "AND"     & session == "2L(4)" ~ 0L,
        treatment == "EITHER"  & session == "2L(4)" ~ 2L,
        session == "2L(4)"                          ~ 1L,
        TRUE                                        ~ 0L
      ),
      levels = c(0, 1, 2),
      labels = c("Difficult", "Easy (LA)", "Easy (HA)")
    ),
    treatment = factor(treatment, levels = c(all_treatments_ordered, "ALONE")),
    pct = round(share * 100, 1)
  )

# Table
tbl_prior_2L <- prior_long_2L %>% arrange(treatment, session, desc(pct))
print(tbl_prior_2L)

# Wide
tbl_prior_2L_wide <- tbl_prior_2L %>%
  select(treatment, session, correlationhigh, Rule, pct) %>%
  pivot_wider(names_from = correlationhigh, values_from = pct, values_fill = 0) %>%
  arrange(treatment, session, desc(Difficult + `Easy (LA)` + `Easy (HA)`))
print(tbl_prior_2L_wide)

# Bar chart: rules with prior share >= 5%
rules_above5_2L <- prior_long_2L %>% filter(pct >= 5) %>% distinct(Rule)

p_prior_2L_top <- prior_long_2L %>%
  semi_join(rules_above5_2L, by = "Rule") %>%
  mutate(treatment_plot = factor(to_plot_treatment(as.character(treatment)),
                                 levels = plot_treatment_levels)) %>%
  ggplot(aes(x = reorder(Rule, -pct), y = pct, fill = correlationhigh)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  scale_fill_manual(values = c("Difficult" = "#2166ac",
                               "Easy (LA)" = "#d6604d",
                               "Easy (HA)" = "#4dac26")) +
  facet_wrap(~ treatment_plot, nrow = 1, scales = "free_x") +
  labs(
    x = "Rule (prior)",
    y = "Prior share (%)",
    fill = NULL
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
    legend.position = "bottom"
  )
print(p_prior_2L_top)

# ---- 8b. Prior shares: collapsed 3-category bar charts (mirror of 7b/7c)
prior_cat_2L <- prior_long_2L %>%
  mutate(
    true_rule = true_rule_map[as.character(treatment)],
    category = case_when(
      Rule %in% c("random", "always_p") ~ "Random",
      Rule == true_rule                 ~ "True Extracted",
      TRUE                              ~ "Other Rule"
    ),
    category = factor(category, levels = c("True Extracted", "Other Rule", "Random"))
  )
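# Optional check, not in the original script (it assumes the estimation
# returns a full mixture per cell): prior shares should sum to ~1 within
# each session x treatment cell, so any rows printed here signal a problem.
prior_long_2L %>%
  group_by(session, treatment) %>%
  summarise(total_share = sum(share), .groups = "drop") %>%
  filter(abs(total_share - 1) > 1e-6) %>%
  print()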
# — 8b-i. By treatment × correlationhigh (mirror of p_rule_cat) —
prior_cat_plot_data <- prior_cat_2L %>%
  mutate(treatment_plot = factor(to_plot_treatment(as.character(treatment)),
                                 levels = plot_treatment_levels)) %>%
  group_by(treatment_plot, correlationhigh, category) %>%
  summarise(share = sum(share), .groups = "drop") %>%
  group_by(treatment_plot, correlationhigh) %>%
  mutate(pct = round(share / sum(share) * 100, 1)) %>%
  ungroup()

p_prior_cat <- ggplot(prior_cat_plot_data,
                      aes(x = category, y = pct, fill = correlationhigh)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  scale_fill_manual(values = c("Difficult" = "#2166ac",
                               "Easy (LA)" = "#d6604d",
                               "Easy (HA)" = "#4dac26")) +
  facet_wrap(~ treatment_plot, nrow = 1) +
  labs(
    x = "Category",
    y = "Prior share (%)",
    fill = NULL
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    legend.position = "bottom"
  )
print(p_prior_cat)

# — 8b-ii. All treatments pooled (mirror of p_rule_cat_pooled) —
prior_cat_pooled_data <- prior_cat_2L %>%
  group_by(correlationhigh, category) %>%
  summarise(share_sum = sum(share), .groups = "drop") %>%
  group_by(correlationhigh) %>%
  mutate(pct = round(share_sum / sum(share_sum) * 100, 1)) %>%
  ungroup() %>%
  left_join(best_rule_acc_cond, by = "correlationhigh") %>%
  mutate(bar_label = case_when(
    category == "True Extracted" ~ sprintf("%.0f%%", avg_best_val * 100),
    category == "Other Rule"     ~ sprintf("\u0394%.0fpp", avg_gap * 100),
    category == "Random"         ~ sprintf("\u0394%.0fpp", (avg_best_val - 0.5) * 100),
    TRUE                         ~ NA_character_
  ))

p_prior_cat_pooled <- ggplot(prior_cat_pooled_data,
                             aes(x = category, y = pct, fill = correlationhigh)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  geom_text(
    aes(label = bar_label),
    position = position_dodge(width = 0.9),
    vjust = -0.4, size = 2.8, na.rm = TRUE
  ) +
  scale_fill_manual(values = c("Difficult" = "#2166ac",
                               "Easy (LA)" = "#d6604d",
                               "Easy (HA)" = "#4dac26")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(
    x = "Category",
    y = "Prior share (%)",
    fill = NULL,
    title = ""
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    legend.position = "bottom"
  )
print(p_prior_cat_pooled)

# ---- 9.
Types (confidence based on predicted_correct_self) ---- # Treatment-level variables (one value per subject × treatment) subj_treat_ec <- Data_2L %>% distinct(subject_id, session, treatment, correlationhigh, time_machine, predicted_correct_self, difficulty_certainty, trueextracted) # Subject-level averages across all treatments subj_avgs_ec <- subj_treat_ec %>% group_by(subject_id) %>% summarise( avg_time_machine = mean(time_machine, na.rm = TRUE), avg_predicted_correct_self = mean(predicted_correct_self, na.rm = TRUE), .groups = "drop" ) # effort: above own avg time; confidence_2: above own avg predicted_correct_self effort_conf2 <- subj_treat_ec %>% left_join(subj_avgs_ec, by = "subject_id") %>% mutate( effort = as.integer(time_machine > avg_time_machine), confidence_2 = as.integer(predicted_correct_self > avg_predicted_correct_self), factor_ec2 = factor( case_when( effort == 0 & confidence_2 == 0 ~ 1L, effort == 1 & confidence_2 == 0 ~ 2L, effort == 0 & confidence_2 == 1 ~ 3L, effort == 1 & confidence_2 == 1 ~ 4L ), levels = 1:4, labels = c("Low effort\nLow conf", "High effort\nLow conf", "Low effort\nHigh conf", "High effort\nHigh conf") ) ) # Distribution of factor_ec2 by treatment × correlationhigh p_factor_ec2 <- effort_conf2 %>% left_join( rule_cat_obs %>% select(subject_id, treatment, category) %>% mutate(treatment = as.character(treatment)), by = c("subject_id", "treatment") ) %>% mutate( treatment_plot = factor(to_plot_treatment(as.character(treatment)), levels = plot_treatment_levels), correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)")) ) %>% group_by(correlationhigh, treatment_plot, factor_ec2) %>% summarise( n = n(), prop_true = mean(category == "True Extracted", na.rm = TRUE), prop_subopt = mean(category == "Other Rule", na.rm = TRUE), prop_random = mean(category == "Random", na.rm = TRUE), .groups = "drop" ) %>% group_by(correlationhigh, treatment_plot) %>% mutate(frac = n / sum(n)) %>% ungroup() %>% ggplot(aes(x = factor_ec2, y = frac, fill = factor_ec2)) + geom_bar(stat = "identity", alpha = 0.85, width = 0.7) + geom_text(aes(label = paste0( "E: ", scales::percent(prop_true, accuracy = 1), "\n", "S: ", scales::percent(prop_subopt, accuracy = 1), "\n", "R: ", scales::percent(prop_random, accuracy = 1) )), vjust = -0.2, size = 2.0, lineheight = 0.85) + scale_fill_brewer(palette = "Set2") + facet_grid(correlationhigh ~ treatment_plot, labeller = labeller(correlationhigh = corr_labels)) + scale_y_continuous(labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.35))) + labs( x = NULL, y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme( axis.text.x = element_text(size = 7), legend.position = "bottom" ) print(p_factor_ec2) # Aggregated across treatments — by correlationhigh only p_factor_ec2_agg <- effort_conf2 %>% left_join( rule_cat_obs %>% select(subject_id, treatment, category) %>% mutate(treatment = as.character(treatment)), by = c("subject_id", "treatment") ) %>% mutate(correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) %>% group_by(correlationhigh, factor_ec2) %>% summarise( n = n(), prop_true = mean(category == "True Extracted", na.rm = TRUE), prop_subopt = mean(category == "Other Rule", na.rm = TRUE), prop_random = mean(category == "Random", na.rm = TRUE), .groups = "drop" ) %>% group_by(correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() %>% ggplot(aes(x = factor_ec2, y = frac, fill = factor_ec2)) + 
geom_bar(stat = "identity", alpha = 0.85, width = 0.7) + geom_text(aes(label = paste0( "E: ", scales::percent(prop_true, accuracy = 1), "\n", "S: ", scales::percent(prop_subopt, accuracy = 1), "\n", "R: ", scales::percent(prop_random, accuracy = 1) )), vjust = -0.2, size = 2.3, lineheight = 0.85) + scale_fill_brewer(palette = "Set2") + facet_wrap(~ correlationhigh, nrow = 1, labeller = labeller(correlationhigh = corr_labels)) + scale_y_continuous(labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.35))) + labs(x = NULL, y = "Fraction of subjects", fill = NULL) + theme_bw() + theme(axis.text.x = element_text(size = 7), legend.position = "bottom") print(p_factor_ec2_agg) # Distribution of factor_ec2 by session × treatment × correlationhigh p_factor_ec2_session <- effort_conf2 %>% left_join( rule_cat_obs %>% select(subject_id, treatment, category) %>% mutate(treatment = as.character(treatment)), by = c("subject_id", "treatment") ) %>% mutate( treatment_plot = factor(to_plot_treatment(as.character(treatment)), levels = plot_treatment_levels), correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)")) ) %>% group_by(session, correlationhigh, treatment_plot, factor_ec2) %>% summarise( n = n(), prop_true = mean(category == "True Extracted", na.rm = TRUE), prop_subopt = mean(category == "Other Rule", na.rm = TRUE), prop_random = mean(category == "Random", na.rm = TRUE), .groups = "drop" ) %>% group_by(session, correlationhigh, treatment_plot) %>% mutate(frac = n / sum(n)) %>% ungroup() %>% ggplot(aes(x = factor_ec2, y = frac, fill = factor_ec2)) + geom_bar(stat = "identity", alpha = 0.85, width = 0.7) + geom_text(aes(label = paste0( "E: ", scales::percent(prop_true, accuracy = 1), "\n", "S: ", scales::percent(prop_subopt, accuracy = 1), "\n", "R: ", scales::percent(prop_random, accuracy = 1) )), vjust = -0.2, size = 2.0, lineheight = 0.85) + scale_fill_brewer(palette = "Set2") + facet_grid(session + correlationhigh ~ treatment_plot, labeller = labeller(correlationhigh = corr_labels)) + scale_y_continuous(labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.35))) + labs( x = NULL, y = "Fraction of subjects", fill = NULL ) + theme_bw() + theme( axis.text.x = element_text(size = 7), legend.position = "bottom" ) print(p_factor_ec2_session) # Aggregated across treatments — by session × correlationhigh p_factor_ec2_agg_session <- effort_conf2 %>% left_join( rule_cat_obs %>% select(subject_id, treatment, category) %>% mutate(treatment = as.character(treatment)), by = c("subject_id", "treatment") ) %>% mutate(correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) %>% group_by(session, correlationhigh, factor_ec2) %>% summarise( n = n(), prop_true = mean(category == "True Extracted", na.rm = TRUE), prop_subopt = mean(category == "Other Rule", na.rm = TRUE), prop_random = mean(category == "Random", na.rm = TRUE), .groups = "drop" ) %>% group_by(session, correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() %>% ggplot(aes(x = factor_ec2, y = frac, fill = factor_ec2)) + geom_bar(stat = "identity", alpha = 0.85, width = 0.7) + geom_text(aes(label = paste0( "E: ", scales::percent(prop_true, accuracy = 1), "\n", "S: ", scales::percent(prop_subopt, accuracy = 1), "\n", "R: ", scales::percent(prop_random, accuracy = 1) )), vjust = -0.2, size = 2.3, lineheight = 0.85) + scale_fill_brewer(palette = "Set2") + facet_grid(session ~ 
correlationhigh, labeller = labeller(correlationhigh = corr_labels)) + scale_y_continuous(labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.35))) + labs(x = NULL, y = "Fraction of subjects", fill = NULL) + theme_bw() + theme(axis.text.x = element_text(size = 7), legend.position = "bottom") print(p_factor_ec2_agg_session) # ---- 10. Subject level types ---- # # Each subject gets ONE type (out of four effort × confidence groups). # Benchmark: session-level mean of time_machine and predicted_correct_self, # pooled across ALL treatments and ALL correlationhigh levels within the session. # A subject is "high effort" / "high confidence" relative to peers in the same session. # Step 1: subject-level averages within session (pool all treatments and corr-high) subj_sess_avg <- subj_treat_ec %>% group_by(subject_id, session) %>% summarise( avg_time = mean(time_machine, na.rm = TRUE), avg_conf = mean(predicted_correct_self, na.rm = TRUE), avg_diff_cert = mean(difficulty_certainty, na.rm = TRUE), .groups = "drop" ) # Step 2: session-level benchmarks sess_benchmarks <- subj_sess_avg %>% group_by(session) %>% summarise( sess_avg_time = mean(avg_time, na.rm = TRUE), sess_avg_conf = mean(avg_conf, na.rm = TRUE), sess_avg_diff_cert = mean(avg_diff_cert, na.rm = TRUE), .groups = "drop" ) # Step 3: assign one type per subject # High confidence requires BOTH predicted_correct_self AND difficulty_certainty # above their respective session averages. subj_types <- subj_sess_avg %>% left_join(sess_benchmarks, by = "session") %>% mutate( effort = as.integer(avg_time > sess_avg_time), confidence = as.integer(avg_conf > sess_avg_conf & avg_diff_cert > sess_avg_diff_cert), type_ec = factor( case_when( effort == 0 & confidence == 0 ~ 1L, effort == 1 & confidence == 0 ~ 2L, effort == 0 & confidence == 1 ~ 3L, effort == 1 & confidence == 1 ~ 4L ), levels = 1:4, labels = c("Low effort\nLow conf", "High effort\nLow conf", "Low effort\nHigh conf", "High effort\nHigh conf") ) ) %>% select(subject_id, session, type_ec) # Step 4: expand to subject × correlationhigh — same type across all corr-high levels # (used for plotting by correlationhigh while keeping one type per subject) subj_corr_types <- Data_2L %>% distinct(subject_id, correlationhigh) %>% mutate(correlationhigh = factor(correlationhigh, levels = c(0, 1, 2), labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) %>% left_join(subj_types, by = "subject_id") # Modal strategy proportions per subject × correlationhigh (for bar labels) cat_by_subj_corr2 <- rule_cat_obs %>% mutate(correlationhigh = factor(case_when( correlationhigh == "Difficult" ~ 0L, correlationhigh == "Easy (LA)" ~ 1L, correlationhigh == "Easy (HA)" ~ 2L ), levels = c(0L, 1L, 2L), labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) %>% group_by(subject_id, correlationhigh) %>% summarise( prop_true_modal = mean(category == "True Extracted"), prop_subopt_modal = mean(category == "Other Rule"), prop_random_modal = mean(category == "Random"), .groups = "drop" ) subj_corr_types <- subj_corr_types %>% left_join(cat_by_subj_corr2, by = c("subject_id", "correlationhigh")) # ---- 10a. Per-session histograms by correlationhigh ---- # For each session: fraction of subjects in each type, split by correlationhigh. # Types are session-specific (benchmark = session mean). 
sess_corr_plot_data <- subj_corr_types %>% group_by(session, correlationhigh, type_ec) %>% summarise( n = n(), prop_true = mean(prop_true_modal, na.rm = TRUE), prop_subopt = mean(prop_subopt_modal, na.rm = TRUE), prop_random = mean(prop_random_modal, na.rm = TRUE), .groups = "drop" ) %>% group_by(session, correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() p_factor_ec_session <- sess_corr_plot_data %>% ggplot(aes(x = type_ec, y = frac, fill = type_ec)) + geom_bar(stat = "identity", alpha = 0.85, width = 0.7) + geom_text(aes(label = paste0( "E: ", scales::percent(prop_true, accuracy = 1), "\n", "S: ", scales::percent(prop_subopt, accuracy = 1), "\n", "R: ", scales::percent(prop_random, accuracy = 1) )), vjust = -0.2, size = 2.3, lineheight = 0.85) + scale_fill_brewer(palette = "Set2") + facet_grid(session ~ correlationhigh) + scale_y_continuous(labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.35))) + labs( x = NULL, y = "Fraction of subjects", fill = NULL, title = "Subject types by session and difficulty", caption = "One type per subject; benchmark = session mean (pooled across all treatments and difficulty levels).\nBar labels: E = Extracted, S = Suboptimal (Other Rule), R = Random (posterior modal strategy)." ) + theme_bw() + theme( axis.text.x = element_text(size = 7), legend.position = "bottom" ) print(p_factor_ec_session) # ---- 10b. All sessions pooled, by correlationhigh ---- # Pool all sessions together. For each correlationhigh level, show the fraction # of subjects (with their session-assigned type) in each group. pooled_corr_plot_data <- subj_corr_types %>% group_by(correlationhigh, type_ec) %>% summarise( n = n(), prop_true = mean(prop_true_modal, na.rm = TRUE), prop_subopt = mean(prop_subopt_modal, na.rm = TRUE), prop_random = mean(prop_random_modal, na.rm = TRUE), .groups = "drop" ) %>% group_by(correlationhigh) %>% mutate(frac = n / sum(n)) %>% ungroup() p_factor_ec_pooled <- pooled_corr_plot_data %>% ggplot(aes(x = type_ec, y = frac, fill = type_ec)) + geom_bar(stat = "identity", alpha = 0.85, width = 0.7) + geom_text(aes(label = paste0( "E: ", scales::percent(prop_true, accuracy = 1), "\n", "S: ", scales::percent(prop_subopt, accuracy = 1), "\n", "R: ", scales::percent(prop_random, accuracy = 1) )), vjust = -0.2, size = 2.3, lineheight = 0.85) + scale_fill_brewer(palette = "Set2") + facet_wrap(~ correlationhigh, nrow = 1) + scale_y_continuous(labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.35))) + labs( x = NULL, y = "Fraction of subjects", fill = NULL, title = "Subject types by difficulty (all sessions pooled)", caption = "One type per subject; benchmark = session mean (pooled across all treatments and difficulty levels).\nBar labels: E = Extracted, S = Suboptimal (Other Rule), R = Random (posterior modal strategy)." 
) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 7),
    legend.position = "bottom"
  )
print(p_factor_ec_pooled)

# effort_conf_corr: kept for section 10c consistency check (global benchmark,
# one row per subject × correlationhigh — different from subj_types above)
subj_corr_ec <- subj_treat_ec %>%
  group_by(subject_id, correlationhigh) %>%
  summarise(
    avg_time_corr = mean(time_machine, na.rm = TRUE),
    avg_conf_corr = mean(predicted_correct_self, na.rm = TRUE),
    .groups = "drop"
  )

overall_benchmarks <- subj_corr_ec %>%
  summarise(
    overall_avg_time = mean(avg_time_corr, na.rm = TRUE),
    overall_avg_conf = mean(avg_conf_corr, na.rm = TRUE)
  )

effort_conf_corr <- subj_corr_ec %>%
  mutate(
    effort_corr     = as.integer(avg_time_corr > overall_benchmarks$overall_avg_time),
    confidence_corr = as.integer(avg_conf_corr > overall_benchmarks$overall_avg_conf),
    factor_ec_corr = factor(
      case_when(
        effort_corr == 0 & confidence_corr == 0 ~ 1L,
        effort_corr == 1 & confidence_corr == 0 ~ 2L,
        effort_corr == 0 & confidence_corr == 1 ~ 3L,
        effort_corr == 1 & confidence_corr == 1 ~ 4L
      ),
      levels = 1:4,
      labels = c("Low effort\nLow conf", "High effort\nLow conf",
                 "Low effort\nHigh conf", "High effort\nHigh conf")
    )
  ) %>%
  left_join(
    rule_cat_obs %>%
      mutate(correlationhigh_int = case_when(
        correlationhigh == "Difficult" ~ 0L,
        correlationhigh == "Easy (LA)" ~ 1L,
        correlationhigh == "Easy (HA)" ~ 2L
      )) %>%
      group_by(subject_id, correlationhigh_int) %>%
      summarise(
        prop_true_modal   = mean(category == "True Extracted"),
        prop_subopt_modal = mean(category == "Other Rule"),
        prop_random_modal = mean(category == "Random"),
        .groups = "drop"
      ),
    by = c("subject_id", "correlationhigh" = "correlationhigh_int")
  )

# ---- 10c. Type consistency across correlationhigh levels ----
#
# For each subject that appears in more than one correlationhigh level,
# check whether they land in the same effort × confidence group.
# The main overlap is correlationhigh 0 (Difficult) vs 1 (Easy (LA)), from
# sessions 2L(1) and 2L(2). Session-2L(3) subjects mostly appear only in
# level 2 (Easy (HA)).
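# Minimal toy sketch (made-up values) of the reshape used next: one row per
# subject, one type column per condition, NA where a subject lacks a level.
tibble(subject_id = c(1, 1, 2),
       corr_label = c("Difficult", "Easy (LA)", "Difficult"),
       type       = c("A", "A", "B")) %>%
  pivot_wider(names_from = corr_label, values_from = type,
              names_prefix = "type_")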
# Wide: one row per subject, one column per correlationhigh level
type_wide <- effort_conf_corr %>%
  select(subject_id, correlationhigh, factor_ec_corr) %>%
  mutate(corr_label = corr_labels[as.character(correlationhigh)]) %>%
  select(-correlationhigh) %>%
  pivot_wider(names_from = corr_label, values_from = factor_ec_corr,
              names_prefix = "type_")

# Consistency summary: subjects with both Difficult and Easy (LA) assignments
consistency_summary <- type_wide %>%
  filter(!is.na(type_Difficult) & !is.na(`type_Easy (LA)`)) %>%
  summarise(
    n_subjects = n(),
    n_same     = sum(type_Difficult == `type_Easy (LA)`),
    pct_same   = round(100 * n_same / n_subjects, 1)
  )

cat("\n================================================================\n")
cat("  TYPE CONSISTENCY: Difficult vs Easy (LA) correlationhigh\n")
cat("================================================================\n")
cat(sprintf("  Subjects with both levels  : %d\n", consistency_summary$n_subjects))
cat(sprintf("  Same group in both levels  : %d (%.1f%%)\n",
            consistency_summary$n_same, consistency_summary$pct_same))
cat("================================================================\n\n")

# Cross-tab heatmap: type at Difficult (x-axis) vs type at Easy (LA) (y-axis)
cross_tab_corr <- type_wide %>%
  filter(!is.na(type_Difficult) & !is.na(`type_Easy (LA)`)) %>%
  count(type_Difficult, `type_Easy (LA)`) %>%
  group_by(type_Difficult) %>%
  mutate(pct_row = n / sum(n)) %>%
  ungroup()

p_type_consistency <- cross_tab_corr %>%
  ggplot(aes(x = type_Difficult, y = `type_Easy (LA)`, fill = n)) +
  geom_tile(colour = "white", linewidth = 0.6) +
  geom_text(aes(label = sprintf("%d\n(%.0f%%)", n, 100 * pct_row)), size = 3) +
  scale_fill_gradient(low = "#f7f7f7", high = "#2166ac") +
  labs(
    x = "Type at Difficult correlationhigh",
    y = "Type at Easy (LA) correlationhigh",
    fill = "N subjects",
    title = "Type consistency across correlationhigh levels (Difficult vs Easy (LA))",
    caption = "Cell count + % of row total. Diagonal = same type in both conditions."
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 7))
print(p_type_consistency)

# ---- 11. Variance decomposition: trueextracted / other_rule / random ----
#
# Outcomes (from section 7b rule_cat_obs):
#   trueextracted – subject inferred the true rule      (category == "True Extracted")
#   other_rule    – subject inferred a different rule   (category == "Other Rule")
#   random        – subject classified as random/always_p (category == "Random")
#
# Predictors:
#   subject_id      – individual fixed effects (absorbs all between-subject heterogeneity)
#   correlationhigh – correlation condition (Difficult / Easy (LA) / Easy (HA))
#   treatment       – specific logical rule (AND, OR, EITHER, …)
#
# OLS is used for straightforward variance decomposition on binary outcomes.
# Type I (sequential) SS entered: subject_id → correlationhigh → treatment
#   eta2         = SS_term / SS_total           (absolute share of total variance)
#   partial_eta2 = SS_term / (SS_term + SS_res) (share net of other predictors)
#
# Note: subject_id is entered first so that correlationhigh and treatment
# capture only the within-subject variance they explain.
# Build outcome dataset from section 7b rule_cat_obs
reg_data_cat <- rule_cat_obs %>%
  mutate(
    trueextracted = as.integer(category == "True Extracted"),
    other_rule    = as.integer(category == "Other Rule"),
    random        = as.integer(category == "Random"),
    subject_id    = factor(subject_id),
    treatment     = factor(treatment)
  )

# Helper: Type I ANOVA → variance decomposition table
# subject_id spans many levels; its rows are collapsed into one summary row.
anova_decomp <- function(model) {
  a      <- anova(model)
  ss     <- a$`Sum Sq`
  ss_res <- tail(ss, 1)
  ss_tot <- sum(ss)
  full <- tibble(
    Predictor = rownames(a),
    df        = a$Df,
    SS        = ss,
    F_stat    = a$`F value`,
    p_value   = a$`Pr(>F)`,
    `eta2_%`  = 100 * ss / ss_tot,
    `partial_eta2_%` = 100 * ifelse(rownames(a) == "Residuals",
                                    NA_real_, ss / (ss + ss_res))
  )
  # Collapse all subject_id levels into a single row
  subj_rows  <- full %>% filter(grepl("^subject_id", Predictor))
  other_rows <- full %>% filter(!grepl("^subject_id", Predictor))
  if (nrow(subj_rows) > 0) {
    subj_summary <- tibble(
      Predictor = "subject_id",
      df        = sum(subj_rows$df),
      SS        = sum(subj_rows$SS),
      F_stat    = NA_real_,
      p_value   = NA_real_,
      `eta2_%`  = sum(subj_rows$`eta2_%`),
      `partial_eta2_%` = 100 * sum(subj_rows$SS) /
        (sum(subj_rows$SS) + tail(full$SS, 1))
    )
    full <- bind_rows(subj_summary, other_rows)
  }
  full %>%
    mutate(
      SS       = round(SS, 4),
      F_stat   = round(F_stat, 3),
      p_value  = round(p_value, 4),
      `eta2_%` = round(`eta2_%`, 2),
      `partial_eta2_%` = round(`partial_eta2_%`, 2)
    )
}

# Fit three OLS models (subject_id entered first)
outcomes <- c("trueextracted", "other_rule", "random")
models_cat <- setNames(
  lapply(outcomes, function(y)
    lm(as.formula(paste(y, "~ subject_id + correlationhigh + treatment")),
       data = reg_data_cat)),
  outcomes
)

# Print results for each outcome
for (y in outcomes) {
  m   <- models_cat[[y]]
  tbl <- anova_decomp(m)
  sm  <- summary(m)
  cat("\n================================================================\n")
  cat(sprintf("  VARIANCE DECOMPOSITION — %s\n", toupper(gsub("_", " ", y))))
  cat("================================================================\n")
  cat(sprintf("  R² = %.3f | Adj R² = %.3f | N = %d\n",
              sm$r.squared, sm$adj.r.squared, nobs(m)))
  cat("----------------------------------------------------------------\n")
  cat("  VARIANCE EXPLAINED BY PREDICTOR (η² and partial η² in %)\n")
  cat("----------------------------------------------------------------\n")
  print(as.data.frame(tbl), row.names = FALSE)
  cat("----------------------------------------------------------------\n")
  cat("  eta2 = SS/SS_total; partial eta2 = SS/(SS + SS_residual)\n")
  cat("  Type I SS: subject_id → correlationhigh → treatment\n")
  cat("================================================================\n")
}

# ---- 11b. Variance decomposition (cost types) ------------
# factor_ec_corr classifies each subject × correlationhigh cell as one of:
#   1 – Low effort / Low conf (below-avg time AND below-avg predicted_correct_self)
#   2 – High effort / Low conf
#   3 – Low effort / High conf
#   4 – High effort / High conf
# Classification benchmark is the overall average across all subject ×
# correlationhigh cells (the global benchmark from effort_conf_corr above).
# Joining on subject_id × correlationhigh assigns the correct type per treatment row.
#
# Type I SS entered: factor_ec_corr → correlationhigh → treatment
reg_data_type <- reg_data_cat %>%
  left_join(
    effort_conf_corr %>%
      mutate(subject_id = factor(subject_id),
             correlationhigh = factor(correlationhigh,
                                      levels = c(0, 1, 2),
                                      labels = c("Difficult", "Easy (LA)", "Easy (HA)"))) %>%
      select(subject_id, correlationhigh, factor_ec_corr),
    by = c("subject_id", "correlationhigh")
  )

# Fit three OLS models with subject type instead of subject fixed effects
models_type <- setNames(
  lapply(outcomes, function(y)
    lm(as.formula(paste(y, "~ factor_ec_corr + correlationhigh + treatment")),
       data = reg_data_type)),
  outcomes
)

cat("\n################################################################\n")
cat("  11b SUBJECT TYPE (effort × confidence) INSTEAD OF SUBJECT ID\n")
cat("################################################################\n")

for (y in outcomes) {
  m   <- models_type[[y]]
  tbl <- anova_decomp(m)
  sm  <- summary(m)
  cat("\n================================================================\n")
  cat(sprintf("  VARIANCE DECOMPOSITION — %s\n", toupper(gsub("_", " ", y))))
  cat("================================================================\n")
  cat(sprintf("  R² = %.3f | Adj R² = %.3f | N = %d\n",
              sm$r.squared, sm$adj.r.squared, nobs(m)))
  cat("----------------------------------------------------------------\n")
  cat("  VARIANCE EXPLAINED BY PREDICTOR (η² and partial η² in %)\n")
  cat("----------------------------------------------------------------\n")
  print(as.data.frame(tbl), row.names = FALSE)
  cat("----------------------------------------------------------------\n")
  cat("  eta2 = SS/SS_total; partial eta2 = SS/(SS + SS_residual)\n")
  cat("  Type I SS: factor_ec_corr → correlationhigh → treatment\n")
  cat("================================================================\n")

  # Coefficient table (manageable size with 4-level factor)
  coef_tbl <- as.data.frame(sm$coefficients)
  coef_tbl <- data.frame(
    Term      = rownames(coef_tbl),
    Estimate  = round(coef_tbl[, "Estimate"], 4),
    Std_Error = round(coef_tbl[, "Std. Error"], 4),
    t_value   = round(coef_tbl[, "t value"], 3),
    p_value   = round(coef_tbl[, "Pr(>|t|)"], 4)
  )
  cat("\n  REGRESSION COEFFICIENTS\n")
  cat("----------------------------------------------------------------\n")
  print(coef_tbl, row.names = FALSE)
  cat("----------------------------------------------------------------\n")
}

# ---- 11c. ICC of trueextracted across correlation conditions (THIS IS THE RELEVANT ONE) ----
# Unit of observation: subject.
# Value per subject × condition: proportion of treatments where they extracted
# the true rule (prop_te). ICC treats the 3 conditions as fixed raters and
# asks how consistently subjects are ranked across all three simultaneously.
#
#   model = "twoway"      : conditions are fixed (not a random sample)
#   type  = "consistency" : ranks matter, not absolute level shifts across conditions
#   unit  = "single"      : one condition = one observation per subject
# ---- 11c. ICC of trueextracted across correlation conditions (THIS IS THE RELEVANT ONE) -------------------
#
# Unit of observation: subject.
# Value per subject × condition: proportion of treatments where they extracted
# the true rule (prop_te). ICC treats the 3 conditions as fixed raters and
# asks how consistently subjects are ranked across all three simultaneously.
#
# model = "twoway"      : conditions are fixed (not a random sample)
# type  = "consistency" : ranks matter, not absolute level shifts across conditions
# unit  = "single"      : one condition = one observation per subject
# This combination is ICC(3,1) in the Shrout–Fleiss taxonomy (two-way mixed,
# consistency, single measure), not ICC(2,1).

# install.packages("irr")
library(irr)

icc_data <- rule_cat_obs %>%
  group_by(subject_id, correlationhigh) %>%
  summarise(prop_te = mean(category == "True Extracted"), .groups = "drop") %>%
  pivot_wider(names_from = correlationhigh, values_from = prop_te) %>%
  select(-subject_id)

icc_result <- irr::icc(icc_data, model = "twoway", type = "consistency", unit = "single")

cat("\n================================================================\n")
cat(" ICC — CONSISTENCY OF TRUE EXTRACTION ACROSS CORRELATION CONDITIONS\n")
cat("================================================================\n")
cat(sprintf(" ICC(3,1) consistency = %.3f [95%% CI: %.3f – %.3f]\n",
            icc_result$value, icc_result$lbound, icc_result$ubound))
cat(sprintf(" F(%d, %d) = %.3f, p = %.4f\n",
            icc_result$df1, icc_result$df2, icc_result$Fvalue, icc_result$p.value))
cat(sprintf(" N subjects = %d, conditions = %s\n",
            icc_result$subjects, paste(names(icc_data), collapse = " / ")))
cat("================================================================\n")

# ---- 12. Subject filter for posterior -------------------------------------------
# Keep only subject × treatment rows whose posterior probability for the
# assigned strategy (Rule_used) is at least KAPPA_ACC_THRESHOLD; the count
# below reports subjects with at least one retained row. Adjust as needed.
KAPPA_ACC_THRESHOLD <- 0.00   # e.g. 0.80, 0.85, 0.90

Data_2L_18 <- Data_2L |>
  filter(!is.na(posterior), posterior >= KAPPA_ACC_THRESHOLD)

cat(sprintf(
  "Section 12 subject filter (posterior >= %.0f%%): %d of %d subjects retained\n",
  KAPPA_ACC_THRESHOLD * 100,
  n_distinct(Data_2L_18$subject_id),
  n_distinct(Data_2L$subject_id)
))

# ---- 13. Strategy cost estimation (IMPORTANT) ------
#
# For each (session x treatment) cell D with k strategies (rho > 0),
# set up the k(k-1)/2 pairwise equations:
#   v^D(pi_i) - v^D(pi_j) = rho^D(pi_i)*kappa(pi_i) - rho^D(pi_j)*kappa(pi_j)
# This is the linear system A*kappa = b, of rank k-1.
# The unique minimum-norm solution (SVD pseudoinverse) requires no
# normalisation: it sets sum(kappa_i / rho_i) = 0 implicitly.
#
# v^D(pi)   = val_ from Data_2L (session-specific accuracy).
# rho^D(pi) = posterior MAP share (Rule_used) per (session x treatment) cell,
#             computed on subjects passing the KAPPA_ACC_THRESHOLD filter.

# ---- 13a. v^D(pi) from val_ columns in Data_2L
# val_ is constant within (session x treatment); take first value per cell.
val_long_18 <- Data_2L |>
  mutate(session = as.character(session)) |>
  group_by(session, treatment) |>
  summarise(across(starts_with("val_"), first), .groups = "drop") |>
  pivot_longer(starts_with("val_"), names_to = "strategy_val", values_to = "v") |>
  mutate(strategy = str_remove(strategy_val, "^val_")) |>
  select(session, treatment, strategy, v)

cat(sprintf("val_long_18: %d rows | sessions: %s\n",
            nrow(val_long_18),
            paste(sort(unique(val_long_18$session)), collapse = ", ")))
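# Sanity check (a sketch, not in the original pipeline): 13a takes first()
# per cell, which silently assumes every val_ column is constant within each
# session x treatment cell. Flag any cell where that assumption fails.
val_check <- Data_2L |>
  mutate(session = as.character(session)) |>
  group_by(session, treatment) |>
  summarise(across(starts_with("val_"), n_distinct), .groups = "drop") |>
  pivot_longer(starts_with("val_"), names_to = "rule", values_to = "n_unique") |>
  filter(n_unique > 1)
if (nrow(val_check) > 0) {
  warning("val_ columns vary within a session x treatment cell:")
  print(val_check)
}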
# ---- 13b. Posterior shares rho^D(pi) from MAP assignment (Rule_used)
# rho = proportion of subjects (passing the accuracy filter) assigned to each
# strategy in each (session x treatment) cell.
shares_long_18 <- Data_2L_18 |>
  mutate(session = as.character(session)) |>
  distinct(subject_id, session, treatment, Rule_used) |>
  filter(!is.na(Rule_used), Rule_used != "always_p") |>
  count(session, treatment, strategy = Rule_used, name = "n_subjects") |>
  group_by(session, treatment) |>
  mutate(rho = n_subjects / sum(n_subjects)) |>
  filter(rho > 0.05) |>
  mutate(rho = rho / sum(rho)) |>   # re-normalise after dropping strategies with share <= 5%
  ungroup() |>
  select(session, treatment, strategy, rho)

cat(sprintf("shares_long_18: %d rows | sessions: %s\n",
            nrow(shares_long_18),
            paste(sort(unique(shares_long_18$session)), collapse = ", ")))

# ---- 13c. Join v and rho per (session x treatment x strategy)
cost_data_v_18 <- shares_long_18 |>
  inner_join(val_long_18, by = c("session", "treatment", "strategy")) |>
  filter(!is.na(v))

cat(sprintf("cost_data_v_18: %d rows | sessions: %s\n",
            nrow(cost_data_v_18),
            paste(sort(unique(cost_data_v_18$session)), collapse = ", ")))

# ---- 13d. Solver: minimum-norm + non-negative shift via SVD pseudoinverse
# For k strategies in one D cell, builds the k(k-1)/2 x k design matrix A
# and RHS b, then returns kappa = A^+ b shifted to enforce kappa >= 0.
#
# The null space of A is spanned by (1/rho_1, ..., 1/rho_k): adding alpha/rho_i
# to every kappa_i leaves all residuals unchanged (A*n = 0). The minimum
# non-negative shift is alpha* = max(0, -min(kappa * rho)), which brings the
# most-negative weighted cost up to exactly 0. The shifted solution therefore
# attains the NNLS optimum for this rank-deficient A: minimal residuals with
# kappa >= 0 guaranteed.
solve_kappa_cell <- function(strats, v_vec, rho_vec) {
  k <- length(strats)
  if (k == 1L) return(tibble(strategy = strats, kappa = NA_real_))

  pairs <- combn(k, 2)
  np <- ncol(pairs)
  A <- matrix(0, np, k)
  b <- numeric(np)
  for (p in seq_len(np)) {
    i <- pairs[1, p]; j <- pairs[2, p]
    A[p, i] <-  rho_vec[i]
    A[p, j] <- -rho_vec[j]
    b[p]    <-  v_vec[i] - v_vec[j]
  }

  sv  <- svd(A)
  tol <- max(dim(A)) * max(sv$d) * .Machine$double.eps
  nz  <- sv$d > tol
  d_inv  <- 1 / sv$d[nz]
  A_plus <- sv$v[, nz, drop = FALSE] %*% (d_inv * t(sv$u[, nz, drop = FALSE]))
  kappa  <- as.numeric(A_plus %*% b)

  # Shift along null direction to enforce kappa >= 0
  alpha <- max(0, -min(kappa * rho_vec))
  kappa <- kappa + alpha / rho_vec

  tibble(strategy = strats, kappa = kappa)
}
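# Property check of solve_kappa_cell() on made-up numbers (a sketch): with
# two strategies, v = (0.95, 0.80) and rho = (0.7, 0.3), the solution must
# (i) satisfy the pairwise equation rho1*k1 - rho2*k2 = v1 - v2 and
# (ii) be non-negative after the null-space shift.
toy_kappa <- solve_kappa_cell(c("A", "B"), c(0.95, 0.80), c(0.7, 0.3))
stopifnot(
  all(toy_kappa$kappa >= -1e-12),   # non-negativity up to floating-point noise
  abs(0.7 * toy_kappa$kappa[1] - 0.3 * toy_kappa$kappa[2] - 0.15) < 1e-8
)
print(toy_kappa)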
# ---- 13e. Solve per (session x treatment) cell
kappa_solved_18 <- cost_data_v_18 |>
  group_by(session, treatment) |>
  summarise(
    kappa_df = list(solve_kappa_cell(strategy, v, rho)),
    .groups = "drop"
  ) |>
  unnest(kappa_df)

# correlationhigh lookup from Data_2L (3 levels: 0 = Difficult, 1 = Easy (LA),
# 2 = Easy (HA)). Re-uses the assignment already defined in section 5,
# avoiding duplication.
corr_lookup_18 <- Data_2L |>
  distinct(session = as.character(session), treatment, correlationhigh)

cost_data_18 <- cost_data_v_18 |>
  left_join(kappa_solved_18, by = c("session", "treatment", "strategy")) |>
  left_join(corr_lookup_18, by = c("session", "treatment")) |>
  mutate(
    correlationhigh = factor(correlationhigh, levels = c(0L, 1L, 2L),
                             labels = c("Difficult", "Easy (LA)", "Easy (HA)"))
  )

# ---- 13f. Results table
cat(sprintf(
  "\nSection 13 -- %d rows | sessions: %s | %d strategies\n",
  nrow(cost_data_18),
  paste(sort(unique(cost_data_18$session)), collapse = ", "),
  n_distinct(cost_data_18$strategy)
))

cat("\n=== Section 13: Strategy cost table (min-norm pairwise system) ===\n")
print(
  cost_data_18 |>
    select(session, treatment, correlationhigh, strategy, rho, v, kappa) |>
    arrange(treatment, session, desc(rho)) |>
    mutate(across(c(rho, v, kappa), ~ round(.x, 4)))
)

# ---- 13g. Scatter: estimated kappa grouped by strategy family
# Strategies are pooled into logical-operator families.
# blue_alone (sound iff blue ON, red OFF) is grouped with INHIBIT as its
# blue-primary mirror image; strategies without a family here (always_0,
# always_1, not_blue_alone, ...) map to NA and are dropped.
# The red dot + error bar shows mean ± SE across all treatment × session
# observations within the family, after removing values outside
# median ± 1.5·IQR within each family.
scatter_data_18g <- cost_data_18 |>
  filter(!is.na(kappa)) |>
  mutate(
    strategy_group = case_when(
      strategy %in% c("always_0", "always_1")               ~ NA_character_,
      strategy %in% c("red", "not_red", "blue", "not_blue") ~ "1-variable",
      strategy %in% c("INHIBIT", "blue_alone")              ~ "INHIBIT",
      str_detect(strategy, "^AND")                          ~ "AND",
      str_detect(strategy, "^EITHER")                       ~ "EITHER",
      str_detect(strategy, "^JOINT")                        ~ "JOINT",
      TRUE                                                  ~ NA_character_
    )
  ) |>
  filter(!is.na(strategy_group)) |>
  group_by(strategy_group) |>
  filter({
    iqr <- IQR(kappa)
    med <- median(kappa)
    kappa >= med - 1.5 * iqr & kappa <= med + 1.5 * iqr
  }) |>
  ungroup()

scatter_summary_18g <- scatter_data_18g |>
  group_by(strategy_group) |>
  summarise(
    mean_kappa = mean(kappa),
    se_kappa   = sd(kappa) / sqrt(n()),
    .groups = "drop"
  )

ggplot(scatter_data_18g, aes(x = strategy_group, y = kappa)) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey60") +
  geom_jitter(aes(colour = treatment, shape = session),
              width = 0.15, height = 0, size = 2.5, alpha = 0.85) +
  geom_point(data = scatter_summary_18g,
             aes(x = strategy_group, y = mean_kappa),
             colour = "red", size = 3.5, inherit.aes = FALSE) +
  geom_errorbar(data = scatter_summary_18g,
                aes(x = strategy_group,
                    ymin = mean_kappa - se_kappa,
                    ymax = mean_kappa + se_kappa),
                colour = "red", width = 0.25, linewidth = 0.9,
                inherit.aes = FALSE) +
  scale_colour_brewer(palette = "Set1") +
  labs(
    x = "Strategy family",
    y = expression(hat(kappa)),
    colour = "Treatment",
    shape  = "Session",
    title  = ""
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))

# ---- 14. Strategy cost estimation - PRIOR ----------------
#
# Identical in structure to Section 13, but rho^D(pi) comes from the
# structural-estimation prior (ESTIMATION_RESULTS) rather than from
# individual MAP assignments. No subject-level accuracy filter is needed.
#
# v^D(pi)   = val_ from Data_2L (same as Section 13).
# rho^D(pi) = prior share from prior_long_2L (built earlier from
#             ESTIMATION_RESULTS).

# ---- 14a. v^D(pi) — reuse val_long_18 (same sessions / treatments)
val_long_19 <- val_long_18   # already computed in 13a

# ---- 14b. Prior shares rho^D(pi) from ESTIMATION_RESULTS
shares_long_19 <- prior_long_2L |>
  select(session, treatment, strategy = Rule, rho = share) |>
  mutate(session = as.character(session), treatment = as.character(treatment)) |>
  filter(!is.na(rho), strategy != "always_p") |>
  group_by(session, treatment) |>
  filter(rho > 0) |>
  mutate(rho = rho / sum(rho)) |>   # re-normalise after dropping zero-share strategies
  ungroup()

cat(sprintf("shares_long_19: %d rows | sessions: %s\n",
            nrow(shares_long_19),
            paste(sort(unique(shares_long_19$session)), collapse = ", ")))

# ---- 14c. Join v and rho per (session x treatment x strategy)
cost_data_v_19 <- shares_long_19 |>
  mutate(session = as.character(session), treatment = as.character(treatment)) |>
  inner_join(
    val_long_19 |>
      mutate(session = as.character(session), treatment = as.character(treatment)),
    by = c("session", "treatment", "strategy")
  ) |>
  filter(!is.na(v))

cat(sprintf("cost_data_v_19: %d rows | sessions: %s\n",
            nrow(cost_data_v_19),
            paste(sort(unique(cost_data_v_19$session)), collapse = ", ")))
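# Side-by-side share comparison (a sketch, not in the original pipeline):
# how far do the prior shares (this section) deviate from the MAP shares of
# Section 13 in the cells where both are defined?
rho_compare <- shares_long_18 |>
  rename(rho_map = rho) |>
  full_join(shares_long_19 |> rename(rho_prior = rho),
            by = c("session", "treatment", "strategy")) |>
  arrange(session, treatment, strategy)
print(rho_compare, n = 20)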
# ---- 14d. Solver: reuses solve_kappa_cell() defined in 13d

# ---- 14e. Solve per (session x treatment) cell
kappa_solved_19 <- cost_data_v_19 |>
  group_by(session, treatment) |>
  summarise(
    kappa_df = list(solve_kappa_cell(strategy, v, rho)),
    .groups = "drop"
  ) |>
  unnest(kappa_df)

corr_lookup_19 <- Data_2L |>
  distinct(session = as.character(session),
           treatment = as.character(treatment),
           correlationhigh)

cost_data_19 <- cost_data_v_19 |>
  left_join(kappa_solved_19, by = c("session", "treatment", "strategy")) |>
  left_join(corr_lookup_19, by = c("session", "treatment")) |>
  mutate(
    correlationhigh = factor(correlationhigh, levels = c(0L, 1L, 2L),
                             labels = c("Difficult", "Easy (LA)", "Easy (HA)"))
  )

# ---- 14f. Results table
cat(sprintf(
  "\nSection 14 -- %d rows | sessions: %s | %d strategies\n",
  nrow(cost_data_19),
  paste(sort(unique(cost_data_19$session)), collapse = ", "),
  n_distinct(cost_data_19$strategy)
))

cat("\n=== Section 14: Strategy cost table — prior shares (min-norm pairwise system) ===\n")
print(
  cost_data_19 |>
    select(session, treatment, correlationhigh, strategy, rho, v, kappa) |>
    arrange(treatment, session, desc(rho)) |>
    mutate(across(c(rho, v, kappa), ~ round(.x, 4)))
)

# ---- 14g. Scatter: estimated kappa grouped by strategy family (prior-based)
# Same families and median ± 1.5·IQR trim as 13g.
scatter_data_19g <- cost_data_19 |>
  filter(!is.na(kappa)) |>
  mutate(
    strategy_group = case_when(
      strategy %in% c("always_0", "always_1")               ~ NA_character_,
      strategy %in% c("red", "not_red", "blue", "not_blue") ~ "1-variable",
      strategy %in% c("INHIBIT", "blue_alone")              ~ "INHIBIT",
      str_detect(strategy, "^AND")                          ~ "AND",
      str_detect(strategy, "^EITHER")                       ~ "EITHER",
      str_detect(strategy, "^JOINT")                        ~ "JOINT",
      TRUE                                                  ~ NA_character_
    )
  ) |>
  filter(!is.na(strategy_group)) |>
  group_by(strategy_group) |>
  filter({
    iqr <- IQR(kappa)
    med <- median(kappa)
    kappa >= med - 1.5 * iqr & kappa <= med + 1.5 * iqr
  }) |>
  ungroup()

scatter_summary_19g <- scatter_data_19g |>
  group_by(strategy_group) |>
  summarise(
    mean_kappa = mean(kappa),
    se_kappa   = sd(kappa) / sqrt(n()),
    .groups = "drop"
  )

ggplot(scatter_data_19g, aes(x = strategy_group, y = kappa)) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey60") +
  geom_jitter(aes(colour = treatment, shape = session),
              width = 0.15, height = 0, size = 2.5, alpha = 0.85) +
  geom_point(data = scatter_summary_19g,
             aes(x = strategy_group, y = mean_kappa),
             colour = "red", size = 3.5, inherit.aes = FALSE) +
  geom_errorbar(data = scatter_summary_19g,
                aes(x = strategy_group,
                    ymin = mean_kappa - se_kappa,
                    ymax = mean_kappa + se_kappa),
                colour = "red", width = 0.25, linewidth = 0.9,
                inherit.aes = FALSE) +
  scale_colour_brewer(palette = "Set1") +
  labs(
    x = "Strategy family",
    y = expression(hat(kappa)),
    colour = "Treatment",
    shape  = "Session",
    title  = "Estimated strategy cost by strategy family (prior shares)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))
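# Cross-check (a sketch, not in the original analysis): correlate the
# MAP-based (Section 13) and prior-based (Section 14) kappa estimates on the
# cells where both exist; a strong positive correlation would indicate the
# two share definitions lead to similar cost rankings.
kappa_compare <- cost_data_18 |>
  select(session, treatment, strategy, kappa_map = kappa) |>
  inner_join(cost_data_19 |>
               select(session, treatment, strategy, kappa_prior = kappa),
             by = c("session", "treatment", "strategy")) |>
  filter(!is.na(kappa_map), !is.na(kappa_prior))
if (nrow(kappa_compare) > 2)
  print(cor.test(kappa_compare$kappa_map, kappa_compare$kappa_prior))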
# ---- 15. Scatter: P(rule) vs. distance from highest-accuracy rule ----------
#
# Unit of observation: one (session × treatment × strategy) triple.
# Every strategy with non-zero posterior share in a given cell is one point.
#
# Why not just the modal rule?
#   The true rule always has val = 1.0 (the machine IS the true rule), so
#   max_val = 1.0 in every cell and the modal rule (= true rule in most cells)
#   always has distance 0 — collapsing the plot to a single vertical strip.
#   Plotting all strategies reveals the full rho–accuracy relationship.
#
# Y axis — rho: posterior MAP share of the strategy in that cell
#   (from shares_long_18, i.e. subjects with posterior >= KAPPA_ACC_THRESHOLD).
#
# X axis — distance: max accuracy in the cell minus this strategy's accuracy.
#   distance = max(val_*) − val_{strategy}
#   = 0 → strategy is as accurate as the best possible rule
#   > 0 → strategy is accuracy-inferior; subjects who use it leave
#         "accuracy on the table"
#
# Points are coloured by treatment and shaped by session.

# 15a. Best accuracy per (session × treatment) across all 16 rules
max_val_20 <- val_long_18 |>
  group_by(session, treatment) |>
  summarise(max_val = max(v, na.rm = TRUE), .groups = "drop")

# 15b. Join rho and val for every strategy present in each cell,
#      then compute distance from the best rule. correlationhigh is
#      re-derived here (shares_long_18 carries no correlationhigh column);
#      the mapping matches the section 5 assignment.
scatter_20 <- shares_long_18 |>
  inner_join(val_long_18, by = c("session", "treatment", "strategy")) |>
  inner_join(max_val_20, by = c("session", "treatment")) |>
  mutate(
    distance = max_val - v,
    correlationhigh = factor(
      case_when(
        session == "2L(1)" & treatment %in% c("AND", "EITHER")          ~ 0L,
        session == "2L(1)" & treatment %in% c("OR", "INHIBIT", "JOINT") ~ 1L,
        session == "2L(2)" & treatment %in% c("OR", "INHIBIT", "JOINT") ~ 0L,
        session == "2L(2)" & treatment %in% c("AND", "EITHER")          ~ 1L,
        session %in% c("2L(3)", "2L(4)")                                ~ 2L,
        TRUE ~ NA_integer_
      ),
      levels = c(0L, 1L, 2L),
      labels = c("Difficult", "Easy (LA)", "Easy (HA)")
    )
  )

cat(sprintf("scatter_20: %d points | sessions: %s | %d strategies\n",
            nrow(scatter_20),
            paste(sort(unique(scatter_20$session)), collapse = ", "),
            n_distinct(scatter_20$strategy)))

cat("\n=== Section 15: Rule share and distance from best rule ===\n")
print(
  scatter_20 |>
    select(session, treatment, strategy, rho, v, max_val, distance) |>
    arrange(treatment, session, distance) |>
    mutate(across(c(rho, v, max_val, distance), ~ round(.x, 4)))
)

# 15c. OLS regression: rho ~ distance
lm_20 <- lm(rho ~ distance, data = scatter_20)
sm_20 <- summary(lm_20)

cat("\n=== Section 15: OLS regression rho ~ distance ===\n")
cat(sprintf(" Intercept : %.4f (SE = %.4f, t = %.3f, p = %.4f)\n",
            coef(sm_20)[1, 1], coef(sm_20)[1, 2], coef(sm_20)[1, 3], coef(sm_20)[1, 4]))
cat(sprintf(" distance  : %.4f (SE = %.4f, t = %.3f, p = %.4f)\n",
            coef(sm_20)[2, 1], coef(sm_20)[2, 2], coef(sm_20)[2, 3], coef(sm_20)[2, 4]))
cat(sprintf(" R\u00b2 = %.3f | Adj R\u00b2 = %.3f | N = %d\n",
            sm_20$r.squared, sm_20$adj.r.squared, nobs(lm_20)))

# 15d. Plot with best-fit line
p_20 <- ggplot(scatter_20, aes(x = distance, y = rho)) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey60") +
  geom_point(aes(colour = treatment, shape = session), size = 2.5, alpha = 0.85) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black",
              linewidth = 0.9, se = TRUE) +
  scale_colour_brewer(palette = "Set1") +
  scale_x_continuous(
    name = "Distance from highest-accuracy rule\n(max accuracy \u2212 strategy accuracy)",
    limits = c(0, NA),
    expand = expansion(mult = c(0.02, 0.08))
  ) +
  scale_y_continuous(
    name = "Posterior share \u03c1",
    limits = c(0, 1)
  ) +
  labs(
    title = "",
    subtitle = sprintf("\u03b2(distance) = %.3f, p = %.4f, R\u00b2 = %.3f",
                       coef(lm_20)[["distance"]], coef(sm_20)[2, 4], sm_20$r.squared),
    colour = "Treatment",
    shape  = "Session"
  ) +
  theme_bw() +
  theme(
    legend.position = "right",
    axis.title = element_text(size = 9),
    plot.subtitle = element_text(size = 8, colour = "grey30")
  )

print(p_20)
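# Nonparametric check (a sketch, not in the original analysis): the OLS
# slope in 15c is pulled by the mass of distance = 0 points; a Spearman
# rank correlation tests the monotone rho-distance relation without that
# leverage (exact = FALSE because of ties).
print(cor.test(scatter_20$distance, scatter_20$rho,
               method = "spearman", exact = FALSE))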
# ---- 16. Scatter: P(rule) vs. distance — suboptimal rules only (distance > 0) ----
#
# Same as section 15 but restricted to strategies whose accuracy is strictly
# below the best rule in the cell (distance > 0). Dropping the true-rule
# points (distance = 0, rho typically highest) lets the regression capture
# how the share of *suboptimal* strategies varies with their accuracy gap.
scatter_15 <- scatter_20 |> filter(distance > 0)

cat(sprintf("\nscatter_15: %d points (distance > 0) | %d strategies\n",
            nrow(scatter_15), n_distinct(scatter_15$strategy)))

# 16a. OLS regression on suboptimal-rule points only
lm_15 <- lm(rho ~ distance, data = scatter_15)
sm_15 <- summary(lm_15)

cat("\n=== Section 16: OLS regression rho ~ distance (distance > 0 only) ===\n")
cat(sprintf(" Intercept : %.4f (SE = %.4f, t = %.3f, p = %.4f)\n",
            coef(sm_15)[1, 1], coef(sm_15)[1, 2], coef(sm_15)[1, 3], coef(sm_15)[1, 4]))
cat(sprintf(" distance  : %.4f (SE = %.4f, t = %.3f, p = %.4f)\n",
            coef(sm_15)[2, 1], coef(sm_15)[2, 2], coef(sm_15)[2, 3], coef(sm_15)[2, 4]))
cat(sprintf(" R\u00b2 = %.3f | Adj R\u00b2 = %.3f | N = %d\n",
            sm_15$r.squared, sm_15$adj.r.squared, nobs(lm_15)))

# 16b. Plot
p_15 <- ggplot(scatter_15, aes(x = distance, y = rho)) +
  geom_point(aes(colour = treatment, shape = session), size = 2.5, alpha = 0.85) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black",
              linewidth = 0.9, se = TRUE) +
  scale_colour_brewer(palette = "Set1") +
  scale_x_continuous(
    name = "Distance from highest-accuracy rule\n(max accuracy \u2212 strategy accuracy)",
    expand = expansion(mult = c(0.02, 0.08))
  ) +
  scale_y_continuous(
    name = "Posterior share \u03c1 [P(rule extracted) in treatment \u00d7 session]",
    limits = c(0, 1)
  ) +
  labs(
    title = "Rule extraction probability vs. accuracy gap — suboptimal rules only",
    subtitle = sprintf("\u03b2(distance) = %.3f, p = %.4f, R\u00b2 = %.3f (distance > 0 only)",
                       coef(lm_15)[["distance"]], coef(sm_15)[2, 4], sm_15$r.squared),
    colour = "Treatment",
    shape  = "Session"
  ) +
  theme_bw() +
  theme(
    legend.position = "right",
    axis.title = element_text(size = 9),
    plot.subtitle = element_text(size = 8, colour = "grey30")
  )

print(p_15)
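# Caveat check (a sketch, not in the original analysis): shares within a
# (session x treatment) cell sum to 1, so the points in 16a are not
# independent. Cluster-robust standard errors by cell, via the 'sandwich'
# and 'lmtest' packages (extra dependencies, not loaded above), give a more
# conservative p-value for the distance slope.
# install.packages(c("sandwich", "lmtest"))
library(sandwich)
library(lmtest)
cl_16 <- interaction(scatter_15$session, scatter_15$treatment)
print(coeftest(lm_15, vcov = vcovCL(lm_15, cluster = cl_16)))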