"""Pick pairs of page-sequence files that span a range of same-info trials.

Every pair of ``SI_PageSequence_<n>.csv`` files in ``01_PageSequences`` is
compared trial by trial.  From the pairs whose ``sum_same_query`` equals
``SELECTED_SUM_SAME_QUERY``, up to ``TARGET_PAIRS`` pairs are chosen so that no
file is used twice, and the chosen files are written to
``02_ParticipantFiles`` under consecutive numbers (ObsA = odd, ObsB = even).
"""
import glob
import itertools
import os
from pathlib import Path

import numpy as np
import pandas as pd

# TO DOs ----------------------------------------------------------------------
# - need to set one (the same) number of choice-only trials
# - create 200 or so sequences
#   (not sure what happens if there is no 0-same-info and no all-same-info
#   pair; would one have to be generated manually?  Probably not possible, as
#   only a limited number of sequences is possible due to the queries)
# - select 50 pairs (I'll use those 50 in each condition)
# - probably need code that guarantees the full range of same-info trials,
#   from lowest to highest, is selected (later)
# - file naming for Obs B: the algo-partner condition needs 50 files, the
#   two-participant condition needs 25 pairs; for now everything goes into one
#   folder and Obs B is simply "Obs A number + 1"

# CONFIGURATION ---------------------------------------------------------------

SEQUENCE_DIR = '01_PageSequences'
PARTICIPANT_DIR = '02_ParticipantFiles'
QUERY_COLUMNS = [
    'reward_inference_query',
    'choice_inference_query',
    'reward_prediction_query',
    'choice_prediction_query',
]
# Number of pairs to keep in the final selection.
TARGET_PAIRS = 25
# Chosen by visual inspection of the per-value printout in main(): a
# sum_same_query of 4-5 gave the widest range of number_same_trials.
SELECTED_SUM_SAME_QUERY = 5


# FUNCTIONS -------------------------------------------------------------------

def compare_trial_types(df1, df2):
    """Count trials whose ``trial_type`` is equal / different in two frames.

    Args:
        df1: DataFrame with a ``trial_type`` column.
        df2: DataFrame with a ``trial_type`` column, aligned row by row with
            ``df1`` (pandas raises ``ValueError`` if the indexes differ).

    Returns:
        Tuple ``(same_count, different_count)`` of ints.
    """
    same_count = int((df1['trial_type'] == df2['trial_type']).sum())
    different_count = int((df1['trial_type'] != df2['trial_type']).sum())
    return same_count, different_count


def compare_query_types(df1, df2):
    """Count, per query column, the trials where both frames hold a ``1``.

    Args:
        df1: DataFrame containing every column listed in ``QUERY_COLUMNS``.
        df2: DataFrame with the same columns, aligned row by row with ``df1``.

    Returns:
        Dict mapping each query column name to its match count (int).
    """
    return {
        column: int(((df1[column] == 1) & (df2[column] == 1)).sum())
        for column in QUERY_COLUMNS
    }


def load_and_compare_files(file1, file2):
    """Load two page-sequence CSVs and summarise how much they overlap.

    Args:
        file1: Path of the CSV treated as observer A.
        file2: Path of the CSV treated as observer B.

    Returns:
        Dict (one result row) with the two base file names, the number of
        same-type trials, the per-query match counts and their sum.
    """
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    same, _ = compare_trial_types(df1, df2)
    counts_queries = compare_query_types(df1, df2)

    return {
        'ObsA': os.path.basename(file1),
        'ObsB': os.path.basename(file2),
        'number_same_trials': same,
        'number_same_RewInf_query': counts_queries['reward_inference_query'],
        'number_same_ChoInf_query': counts_queries['choice_inference_query'],
        'number_same_RewPred_query': counts_queries['reward_prediction_query'],
        'number_same_ChoPred_query': counts_queries['choice_prediction_query'],
        'sum_same_query': sum(counts_queries[column] for column in QUERY_COLUMNS),
    }


def list_sequence_files(directory_path):
    """Return all ``SI_PageSequence_<n>.csv`` paths, ordered by ``<n>``.

    The files actually present are listed (instead of rebuilding names from a
    counter), so the highest-numbered file is included and gaps in the
    numbering are harmless.  Files whose suffix is not a number are ignored.

    Args:
        directory_path: Folder that holds the page-sequence CSVs.

    Returns:
        List of path strings sorted by their numeric suffix.
    """
    numbered = []
    for path in glob.glob(os.path.join(directory_path, 'SI_PageSequence_*.csv')):
        suffix = Path(path).stem.rsplit('_', 1)[-1]
        if suffix.isdigit():
            numbered.append((int(suffix), path))
    return [path for _, path in sorted(numbered)]


def rows_per_value_quota(same_trial_counts, target_pairs=TARGET_PAIRS):
    """Return how many pairs may be taken per ``number_same_trials`` value.

    Args:
        same_trial_counts: Series of ``number_same_trials`` values.
        target_pairs: Number of pairs wanted overall.

    Returns:
        ``ceil(target_pairs / (max - min))`` as a float; ``target_pairs`` when
        all values are identical (avoids dividing by zero); ``0.0`` when the
        series is empty.
    """
    if same_trial_counts.empty:
        return 0.0
    spread = same_trial_counts.max() - same_trial_counts.min()
    if spread == 0:
        return float(target_pairs)
    return float(np.ceil(target_pairs / spread))  # rounding up


def select_unique_pairs(candidates, per_value_quota):
    """Greedily pick pairs so that no file is used more than once.

    Rows are visited starting with the rarest ``number_same_trials`` values,
    so that scarce values claim their files before common values use them up.
    A row is taken when its value has not reached ``per_value_quota`` yet and
    neither of its two files has been used by an earlier pick.

    Args:
        candidates: DataFrame with ``ObsA``, ``ObsB`` and
            ``number_same_trials`` columns and a unique index.
        per_value_quota: Maximum number of rows per ``number_same_trials``.

    Returns:
        New DataFrame of the picked rows (plus a ``count_number_same_trials``
        column), ordered by ``number_same_trials`` with a fresh 0..n-1 index.
    """
    group_sizes = candidates['number_same_trials'].value_counts()
    ordered = candidates.assign(
        count_number_same_trials=candidates['number_same_trials'].map(group_sizes)
    ).sort_values(by='count_number_same_trials', kind='stable')

    taken_per_value = {}
    used_files = set()
    chosen_labels = []
    for label, obs_a, obs_b, value in zip(
        ordered.index, ordered['ObsA'], ordered['ObsB'], ordered['number_same_trials']
    ):
        if taken_per_value.get(value, 0) >= per_value_quota:
            continue
        if obs_a in used_files or obs_b in used_files:
            continue
        used_files.update((obs_a, obs_b))
        chosen_labels.append(label)
        taken_per_value[value] = taken_per_value.get(value, 0) + 1

    # Restore the low-to-high order so the later even thinning runs across
    # the range of number_same_trials.
    return (
        ordered.loc[chosen_labels]
        .sort_values(by='number_same_trials', kind='stable')
        .reset_index(drop=True)
    )


def thin_evenly(rows, target_rows=TARGET_PAIRS):
    """Drop evenly spaced rows until at most ``target_rows`` remain.

    Args:
        rows: DataFrame to thin out.
        target_rows: Number of rows to keep.

    Returns:
        New DataFrame with a fresh 0..n-1 index.  When ``rows`` already has
        ``target_rows`` rows or fewer it is returned unchanged (re-indexed).
    """
    surplus = len(rows) - target_rows
    if surplus <= 0:
        return rows.reset_index(drop=True)
    # Positions start at 0 and are floored, so the first row is always one of
    # the dropped rows.
    positions = np.linspace(0, len(rows) - 1, num=surplus, endpoint=False).astype(int)
    return rows.drop(rows.index[positions]).reset_index(drop=True)


def write_participant_files(pairs, source_dir=SEQUENCE_DIR, target_dir=PARTICIPANT_DIR):
    """Copy each selected pair into ``target_dir`` under consecutive numbers.

    For the k-th pair (k = 0, 1, ...) the ObsA file becomes
    ``SI_PageSequence_{2k+1}.csv`` and the ObsB file
    ``SI_PageSequence_{2k+2}.csv``.  Each copy gets a ``number_same_trials``
    column plus the number of its partner file (``ObserverB`` in the ObsA
    file, ``ObserverA`` in the ObsB file).

    Args:
        pairs: DataFrame with ``ObsA``, ``ObsB`` and ``number_same_trials``.
        source_dir: Folder holding the original sequence files.
        target_dir: Output folder; created if it does not exist.
    """
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    counter = 1
    for obs_a_file, obs_b_file, same_count in zip(
        pairs['ObsA'], pairs['ObsB'], pairs['number_same_trials']
    ):
        print('ObsA_file', obs_a_file)
        obs_a_path = source_dir / obs_a_file
        obs_b_path = source_dir / obs_b_file

        if obs_a_path.exists():
            df_interim = pd.read_csv(obs_a_path)
            df_interim['number_same_trials'] = same_count
            df_interim['ObserverB'] = counter + 1
            df_interim.to_csv(target_dir / f'SI_PageSequence_{counter}.csv', index=False)
        else:
            print(f'WARNING: {obs_a_path} not found, skipped')

        if obs_b_path.exists():
            df_interim = pd.read_csv(obs_b_path)
            df_interim['number_same_trials'] = same_count
            df_interim['ObserverA'] = counter
            df_interim.to_csv(target_dir / f'SI_PageSequence_{counter + 1}.csv', index=False)
        else:
            print(f'WARNING: {obs_b_path} not found, skipped')

        # +2 per pair: one number for Obs A, one for Obs B.  The numbers are
        # consumed even when a file was missing, so pair numbering stays
        # aligned with the selection table.
        counter += 2


# EXECUTION -------------------------------------------------------------------

def main():
    """Run the comparison, the pair selection and the file export."""
    file_names = list_sequence_files(SEQUENCE_DIR)
    if len(file_names) < 2:
        raise SystemExit(
            f'Need at least two SI_PageSequence_*.csv files in {SEQUENCE_DIR}, '
            f'found {len(file_names)}'
        )

    # Compare every file with every other file.
    results = [
        load_and_compare_files(file1, file2)
        for file1, file2 in itertools.combinations(file_names, 2)
    ]
    df_results = pd.DataFrame(results)

    df_sorted = df_results.sort_values(
        by=['number_same_trials', 'sum_same_query'], ascending=[True, True]
    )
    df_sorted.to_csv('SI_Exp2_SameInfoSameQueryTrialsSorted.csv')

    # Print, for each sum_same_query, how many pairs exist per
    # number_same_trials; used to decide which sum_same_query gives the
    # widest range (see SELECTED_SUM_SAME_QUERY).
    for each in df_sorted['sum_same_query'].unique().tolist():
        df_interim = df_sorted[df_sorted['sum_same_query'] == each]
        counts_per_number_same_info_trial = (
            df_interim['number_same_trials'].value_counts().sort_index()
        )
        print("Same query number", each)
        print(counts_per_number_same_info_trial)

    df_subset = df_sorted[df_sorted['sum_same_query'] == SELECTED_SUM_SAME_QUERY]

    select_how_many = rows_per_value_quota(df_subset['number_same_trials'])
    print('selectHowMany', select_how_many)

    selected_rows = select_unique_pairs(df_subset, select_how_many)
    print('selected_rows', selected_rows)
    selected_rows.to_csv('SI_Exp2_SelectedFiles.csv')

    # Thin the selection down to TARGET_PAIRS pairs (= 2 * TARGET_PAIRS files).
    selected_rows_after_deletion = thin_evenly(selected_rows, TARGET_PAIRS)
    selected_rows_after_deletion.to_csv('SI_Exp2_50SameInfoTrials.csv')

    write_participant_files(selected_rows_after_deletion, SEQUENCE_DIR, PARTICIPANT_DIR)


# NOTE: two earlier, disabled selection drafts that used to sit at the end of
# this file as string literals were removed: (1) taking a fixed number of rows
# per number_same_trials in the 7-25 window while tracking used ObsA and ObsB
# files separately, and (2) a first-come greedy pick of unique pairs over the
# fully sorted table.  select_unique_pairs() supersedes both.

if __name__ == '__main__':
    main()