import itertools
import os

import pandas as pd

# TO DOs ---------------------------------------------------------------------------------------------------------------
# need to set one (the same) number of choice-only trials
# create 200 or so sequences
# (--> not sure what will happen if there is no 0-same-info and no all-same-info pair; would I then have to generate one manually?
#  --> but I don't think that is possible, as there is only a limited number of sequences possible due to the queries)
# select 50 pairs (I'll use those 50 in each condition)
# I assume that I WILL have to add some code that ensures that I select the full range of same-info trials from lowest to highest (but I'll do that later)

# FUNCTIONS ------------------------------------------------------------------------------------------------------------

# Count the number of same/different trial types for a pair of page-sequence dataframes
def compare_trial_types(df1, df2):
    same_count = sum(df1['trial_type'] == df2['trial_type'])
    different_count = sum(df1['trial_type'] != df2['trial_type'])
    return same_count, different_count


# Load a pair of page-sequence files and run the compare_trial_types function on them
def load_and_compare_files(file1, file2):
    # Open/load files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    # Calculate how many same/different trial types the pair has
    same, different = compare_trial_types(df1, df2)
    # Save the number of same trial types for this pair
    new_row = {
        'ObsA': os.path.basename(file1),
        'ObsB': os.path.basename(file2),
        'number_same_trials': same
    }
    return new_row


# EXECUTION ------------------------------------------------------------------------------------------------------------

# Get a list of all file names
directory_path = '01_ParticipantFiles'
file_names = []
for i in range(1, 121):
    file_path = f"{directory_path}/SI_PageSequence_{i}.csv"
    file_names.append(file_path)

# Compare every file with every other file and record how many same-info trials each pair shares
results = []
for file1, file2 in itertools.combinations(file_names, 2):
    new_row = load_and_compare_files(file1, file2)
    results.append(new_row)

# Save
df_results = pd.DataFrame(results)
df_results.to_csv('SI_Exp2_SameInfoTrials.csv')
print(df_results)

# Now select 50 pairs such that they include the smallest and greatest number of same-info trials,
# plus an even distribution in between

# Sort the results by the number of same trials
df_sorted = df_results.sort_values(by='number_same_trials')

# Initialise an empty list for selected pairs and a set of files already used
selected_pairs = []
used_files = set()

# Select pairs; note that this is based purely on the order in the sorted dataframe
# Note to self: _ is used in Python as a throwaway variable (here it's the index, which isn't needed)
for _, row in df_sorted.iterrows():
    if row['ObsA'] not in used_files and row['ObsB'] not in used_files:
        # Add the pair to the selected list
        selected_pairs.append(row)
        # Mark these files as used
        used_files.update([row['ObsA'], row['ObsB']])
        # Break the loop once 50 unique pairs are selected
        # if len(selected_pairs) >= 50:
        #     break

# Convert and print
df_selected_pairs = pd.DataFrame(selected_pairs)
print(df_selected_pairs)
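
# ------------------------------------------------------------------------------------------------------------------------
# Sketch (an assumption, not part of the pipeline above): one possible way to reduce the selection to exactly 50 pairs
# that cover the full range of same-info trial counts, from the lowest to the highest, with a roughly even spread in
# between (see the TO DO on selecting the full range). It picks evenly spaced row positions in the already sorted,
# non-overlapping pair list; n_target_pairs and df_final_pairs are illustrative names, not variables used above.
import numpy as np

n_target_pairs = 50
if len(df_selected_pairs) >= n_target_pairs:
    # Evenly spaced positions across the sorted pairs; always includes the first (minimum) and last (maximum) pair
    positions = np.linspace(0, len(df_selected_pairs) - 1, n_target_pairs).round().astype(int)
    df_final_pairs = df_selected_pairs.iloc[positions]
    print(df_final_pairs)
else:
    print(f"Only {len(df_selected_pairs)} non-overlapping pairs available; cannot select {n_target_pairs}.")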