import itertools
import os

import pandas as pd

# TO DOs ---------------------------------------------------------------------------------------------------------------
# need to set one (the same) number of choice-only trials
# create 200 or so sequences
# (--> not sure what will happen if there is no 0-same-info and no all-same-info pair; would I then have to generate one manually?
#  --> but I don't think that is possible, as there is only a limited number of sequences possible due to the queries)
# select 50 pairs (I'll use those 50 in each condition)
# I assume that I WILL have to add some code that ensures that I select the full range of same-info trials from lowest to highest (but I'll do that later)

# FUNCTIONS ------------------------------------------------------------------------------------------------------------

# Count the number of same/different trial types for a pair of page-sequence dataframes
def compare_trial_types(df1, df2):
    same_count = sum(df1['trial_type'] == df2['trial_type'])
    different_count = sum(df1['trial_type'] != df2['trial_type'])
    return same_count, different_count


# Load a pair of page-sequence files and run the compare_trial_types function on them
def load_and_compare_files(file1, file2):
    # Open/load files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    # Calculate how many same/different trial types the pair has
    same, different = compare_trial_types(df1, df2)
    # Save the number of same trial types for this pair
    new_row = {
        'ObsA': os.path.basename(file1),
        'ObsB': os.path.basename(file2),
        'number_same_trials': same
    }
    return new_row


# EXECUTION ------------------------------------------------------------------------------------------------------------

# Get a list of all file names
directory_path = '01_ParticipantFiles'
file_names = []
for i in range(1, 121):
    file_path = f"{directory_path}/SI_PageSequence_{i}.csv"
    file_names.append(file_path)

# Compare every file with every other file and record how many same-info trials each pair shares
results = []
for file1, file2 in itertools.combinations(file_names, 2):
    new_row = load_and_compare_files(file1, file2)
    results.append(new_row)

# Save
df_results = pd.DataFrame(results)
df_results.to_csv('SI_Exp2_SameInfoTrials.csv')
print(df_results)

# Now select 50 pairs such that they include the smallest and greatest number of same-info trials,
# plus an even distribution in between

# Sort the results by the number of same trials
df_sorted = df_results.sort_values(by='number_same_trials')

# Initialise an empty list for selected pairs and a set of files already used
selected_pairs = []
used_files = set()

# Select pairs; note that this is based purely on the order in the sorted dataframe
# Note to self: _ is used in Python as a throwaway variable (here it's the index, which isn't needed)
for _, row in df_sorted.iterrows():
    if row['ObsA'] not in used_files and row['ObsB'] not in used_files:
        # Add the pair to the selected list
        selected_pairs.append(row)
        # Mark these files as used
        used_files.update([row['ObsA'], row['ObsB']])
        # Break the loop once 50 unique pairs are selected
        # if len(selected_pairs) >= 50:
        #     break

# Convert and print
df_selected_pairs = pd.DataFrame(selected_pairs)
print(df_selected_pairs)
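
# ------------------------------------------------------------------------------------------------------------------------
# Sketch (an assumption, not part of the pipeline above): one possible way to reduce the selection to exactly 50 pairs
# that cover the full range of same-info trial counts, from the lowest to the highest, with a roughly even spread in
# between (see the TO DO on selecting the full range). It picks evenly spaced row positions in the already sorted,
# non-overlapping pair list; n_target_pairs and df_final_pairs are illustrative names, not variables used above.
import numpy as np

n_target_pairs = 50
if len(df_selected_pairs) >= n_target_pairs:
    # Evenly spaced positions across the sorted pairs; always includes the first (minimum) and last (maximum) pair
    positions = np.linspace(0, len(df_selected_pairs) - 1, n_target_pairs).round().astype(int)
    df_final_pairs = df_selected_pairs.iloc[positions]
    print(df_final_pairs)
else:
    print(f"Only {len(df_selected_pairs)} non-overlapping pairs available; cannot select {n_target_pairs}.")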