import glob
import itertools
import numpy as np
import os
import pandas as pd
from pathlib import Path

# TO DOs ---------------------------------------------------------------------------------------------------------------
# need to set one (the same) number of choice-only trials
# create 200 or so sequences
# (--> not sure what will happen if there is no 0 same info and no all same info pair; then I have to generate one manually? --> but I don't think that is possible as there is only a limited number of sequences possible due to the queries)
# select 50 pairs (I'll use those 50 in each condition)
# I assume that I WILL have to add some code that ensures that I select the full range of same-info trials from lowest to highest (but I'll do that later)

# FUNCTIONS ------------------------------------------------------------------------------------------------------------

def compare_trial_types(df1, df2):
    """Count the rows on which two sequences agree / disagree in trial type.

    Both frames must provide a 'trial_type' column. The columns are compared
    element-wise with ``==``, so the two Series have to be identically
    labelled (pandas raises otherwise).

    Returns:
        (same_count, different_count): number of rows whose 'trial_type'
        values are equal, and number of rows where they are not.
    """
    is_same = df1['trial_type'] == df2['trial_type']
    same_count = is_same.sum()
    # Every row that is not a match counts as "different".
    different_count = (~is_same).sum()
    return same_count, different_count

def compare_query_types(df1, df2):
    """Count, per query column, the rows where both sequences hold a 1.

    The four query columns checked are 'reward_inference_query',
    'choice_inference_query', 'reward_prediction_query' and
    'choice_prediction_query'; both frames must contain all of them.

    Returns:
        dict mapping each query column name (in the order listed above) to
        the number of rows in which df1 and df2 both have the value 1.
    """
    query_columns = (
        'reward_inference_query',
        'choice_inference_query',
        'reward_prediction_query',
        'choice_prediction_query',
    )
    return {
        name: (df1[name].eq(1) & df2[name].eq(1)).sum()
        for name in query_columns
    }

def load_and_compare_files(file1, file2):
    """Read two page-sequence CSVs and summarise how much they overlap.

    Args:
        file1: path of the first sequence file (reported as 'ObsA').
        file2: path of the second sequence file (reported as 'ObsB').

    Returns:
        dict with the two file base names, the number of rows with the same
        trial type, the per-query counts of rows where both files have a 1,
        and the sum of those four query counts.
    """
    frame_a, frame_b = (pd.read_csv(path) for path in (file1, file2))

    # Only the number of matching trial types is reported; the count of
    # differing rows is not needed here.
    same, _ = compare_trial_types(frame_a, frame_b)
    query_matches = compare_query_types(frame_a, frame_b)

    rew_inf = query_matches['reward_inference_query']
    cho_inf = query_matches['choice_inference_query']
    rew_pred = query_matches['reward_prediction_query']
    cho_pred = query_matches['choice_prediction_query']

    summary = {}
    summary['ObsA'] = os.path.basename(file1)
    summary['ObsB'] = os.path.basename(file2)
    summary['number_same_trials'] = same
    summary['number_same_RewInf_query'] = rew_inf
    summary['number_same_ChoInf_query'] = cho_inf
    summary['number_same_RewPred_query'] = rew_pred
    summary['number_same_ChoPred_query'] = cho_pred
    summary['sum_same_query'] = rew_inf + cho_inf + rew_pred + cho_pred
    return summary

# EXECUTION ------------------------------------------------------------------------------------------------------------

# Collect the page-sequence files that actually exist on disk.
# The earlier version rebuilt the names from range(1, len(fns)), which dropped
# the highest-numbered file and assumed the numbering had no gaps. Using the
# globbed paths directly avoids both problems.
directory_path = '01_PageSequences'
fns = glob.glob(directory_path + "/SI_PageSequence_*.csv")

# Keep only files whose suffix after the last underscore is a plain number and
# order them numerically (so 2 comes before 10), matching the 1, 2, 3, ...
# order the pairs were previously generated in.
numbered_files = []
for path in fns:
    suffix = Path(path).stem.rsplit('_', 1)[-1]
    if suffix.isdecimal():
        numbered_files.append((int(suffix), path))
file_names = [path for _, path in sorted(numbered_files)]

# Compare every file with every other file (unordered pairs) and record how
# many same-info trials and same-query trials each pair shares.
results = [
    load_and_compare_files(file1, file2)
    for file1, file2 in itertools.combinations(file_names, 2)
]

df_results = pd.DataFrame(results)
# Optional debugging aid: dump the unsorted pair table.
#df_results.to_csv('SI_Exp2_SameInfoTrials.csv')

# Goal of the steps from here on: pick 25 pairs that cover the smallest and
# the largest number of same-info trials, spread evenly in between.
# First, order all pairs by trial-type overlap, then by total query overlap
# (both ascending), and save the full sorted table.
df_sorted = df_results.sort_values(
    by=['number_same_trials', 'sum_same_query'],
    ascending=True,
)
df_sorted.to_csv('SI_Exp2_SameInfoSameQueryTrialsSorted.csv')

# For every distinct total of shared queries, print how many pairs exist per
# number of same-info trials, so the best query total can be chosen by eye.
unique_number_same_query = df_sorted['sum_same_query'].unique().tolist()
for query_total in unique_number_same_query:
    has_query_total = df_sorted['sum_same_query'] == query_total
    counts_per_number_same_info_trial = (
        df_sorted.loc[has_query_total, 'number_same_trials']
        .value_counts()
        .sort_index()
    )
    print("Same query number", query_total)
    print(counts_per_number_same_info_trial)

# Look at the counts printed above and decide which sum_same_query value gives
# the widest range of number_same_trials. The author's visual inspection found
# that 4-5 gives the widest range; sum_same_query == 5 is used.
# .copy() makes df_subset an independent frame, so adding a column below acts
# on df_subset itself and does not raise pandas' SettingWithCopyWarning.
df_subset = df_sorted[df_sorted['sum_same_query'] == 5].copy()

# Attach to every row how many pairs share its number_same_trials value.
counts_per_number_same_info_trial = df_subset['number_same_trials'].value_counts()
df_subset['count_number_same_trials'] = df_subset['number_same_trials'].map(counts_per_number_same_info_trial)

# NOTE(review): the previous code called
#   df_subset.sort_values(by=['count_number_same_trials'], ascending=[True])
# without assigning the result, so it never had any effect. The no-op call is
# removed. df_subset therefore keeps the order inherited from df_sorted
# (ascending number_same_trials). Actually sorting by rarity would change which
# pairs are selected downstream - confirm the intent before enabling it.

# Spread of number_same_trials in the subset; it determines how many pairs to
# take per distinct value so that roughly 25 pairs are collected overall.
same_trials_range = df_subset['number_same_trials'].max() - df_subset['number_same_trials'].min()
if same_trials_range > 0:
    selectHowMany = np.ceil(25 / same_trials_range)  # rounding up
else:
    # Only one distinct value (spread 0) or an empty subset (spread NaN):
    # avoid dividing by zero and allow up to 25 pairs from that single group.
    selectHowMany = 25.0
print('selectHowMany', selectHowMany)

# Greedy selection of pairs from df_subset, in its current row order:
#   * take at most selectHowMany pairs per distinct number_same_trials value;
#   * never reuse a sequence file, whether it appeared as ObsA or as ObsB.
counts = {}          # number_same_trials value -> pairs selected so far
picked_rows = []     # selected rows, turned into one DataFrame after the loop
used_files = set()   # every file name already part of a selected pair

for _, row in df_subset.iterrows():
    current_number = row['number_same_trials']

    # Quota for this number_same_trials value already filled.
    if counts.get(current_number, 0) >= selectHowMany:
        continue
    # One of the two files is already part of another selected pair.
    if row['ObsA'] in used_files or row['ObsB'] in used_files:
        continue

    used_files.update((row['ObsA'], row['ObsB']))
    picked_rows.append(row)
    counts[current_number] = counts.get(current_number, 0) + 1

# Build the frame once instead of concatenating inside the loop (which copied
# the growing frame on every hit), then renumber the rows from 0.
selected_rows = pd.DataFrame(picked_rows).reset_index(drop=True)
print('selected_rows', selected_rows)
selected_rows.to_csv('SI_Exp2_SelectedFiles.csv')
# Thin the selection down to 25 pairs by dropping rows at evenly spaced
# positions, so the remaining pairs still span the whole selection.
num_rows = len(selected_rows)
# Clamp at 0: with fewer than 25 selected pairs there is nothing to delete,
# and np.linspace raises ValueError for a negative number of samples.
num_rows_to_delete = max(num_rows - 25, 0)
if num_rows < 25:
    print('Only', num_rows, 'pairs were selected; fewer than the 25 required.')

indices_to_delete = np.linspace(0, num_rows - 1, num=num_rows_to_delete, endpoint=False).astype(int)
selected_rows_after_deletion = selected_rows.drop(selected_rows.index[indices_to_delete])
# Reset index after deletion
selected_rows_after_deletion.reset_index(drop=True, inplace=True)

# NOTE: the file name says "50", but the code above keeps 25 pairs
# (i.e. 50 individual sequence files).
selected_rows_after_deletion.to_csv('SI_Exp2_50SameInfoTrials.csv')

# Copy the selected sequences into the participant folder under new,
# consecutive numbers: the ObsA file of a pair is written as number `counter`
# and its ObsB partner as `counter + 1`. Each written file also records the
# pair's number_same_trials and the number of its partner file.
# Author's open design question: for an algorithmic partner 50 sequences are
# needed, for two human participants 25 pairs; for now everything is saved in
# one folder and the partner is addressed by the +1 numbering.
folder_path = Path('01_PageSequences')
new_folder_path = Path('02_ParticipantFiles')
# Make sure the output folder exists; to_csv does not create directories.
new_folder_path.mkdir(parents=True, exist_ok=True)
counter = 1
for _, row in selected_rows_after_deletion.iterrows():
    ObsA_file = row['ObsA']
    ObsB_file = row['ObsB']
    same_count = row['number_same_trials']
    print('ObsA_file', ObsA_file)

    # Construct the full path for ObsA and ObsB files
    ObsA_path = folder_path / ObsA_file
    ObsB_path = folder_path / ObsB_file

    # Only write complete pairs. Previously a missing ObsB file left the
    # counter unchanged, so the next pair overwrote the ObsA output, and a
    # missing ObsA file produced an ObsB file pointing at a partner that was
    # never written.
    missing_paths = [str(path) for path in (ObsA_path, ObsB_path) if not path.exists()]
    if missing_paths:
        print('Skipping pair, missing file(s):', missing_paths)
        continue

    ObsB_counter = counter + 1

    # ObsA -> SI_PageSequence_<counter>.csv, pointing at its ObsB partner.
    df_interim = pd.read_csv(ObsA_path)
    ObsA_new_name = new_folder_path / f'SI_PageSequence_{counter}.csv'
    df_interim['number_same_trials'] = same_count
    df_interim['ObserverB'] = ObsB_counter
    df_interim.to_csv(ObsA_new_name, index=False)

    # ObsB -> SI_PageSequence_<counter + 1>.csv, pointing back at ObsA.
    df_interim = pd.read_csv(ObsB_path)
    df_interim['number_same_trials'] = same_count
    df_interim['ObserverA'] = counter
    ObsB_new_name = new_folder_path / f'SI_PageSequence_{ObsB_counter}.csv'
    df_interim.to_csv(ObsB_new_name, index=False)

    counter += 2  # two output files were written for this pair



















"""
# Iterate over each unique value and its count
for value, count in counts_sorted.iteritems():
    # Calculate how many rows to select for this value, based on 'rows_per_value'
    # and ensure I don't select more rows than exist for this value
    #num_rows_to_select = min(int(selectHowMany), count)

    # Filter the original DataFrame for rows matching this value and select up to 'num_rows_to_select' rows
    rows_for_value = df[df['column_name'] == value].head(selectHowMany)

    # Append these rows to 'selected_rows'
    selected_rows = pd.concat([selected_rows, rows_for_value], ignore_index=True)

    # If 'selected_rows' reaches or exceeds 50, break the loop
    if len(selected_rows) >= 50:
        break
        
# Reset index for the selected rows dataframe
selected_rows.reset_index(drop=True, inplace=True)

# Filter the dataframe for 'number_same_trials' between 7 and 25 (I ignored the one 26 that was spit out)
filtered_data = df_sorted[df_sorted['number_same_trials'].between(7, 25)]

# Initialize an empty dataframe to store the selected rows
selected_rows = pd.DataFrame()
# Track used ObsA and ObsB to ensure uniqueness
used_obs_a = set()
used_obs_b = set()

# Iterate over the range 7 to 25 to select 3 files for each number, ensuring unique ObsA and ObsB
for number in range(7, 27):  # Include 25
    # Filter for the current 'number_same_trials'
    temp_data = filtered_data[filtered_data['number_same_trials'] == number]


    # Counter for selected files for the current number
    selected_count = 0

    for index, row in temp_data.iterrows():
        # Check if ObsA and ObsB are unique and select the row if they are
        if row['ObsA'] not in used_obs_a and row['ObsB'] not in used_obs_b and selected_count < 2:
            # Add to selected rows
            selected_rows = pd.concat([selected_rows, pd.DataFrame([row])], ignore_index=True)

            # Mark ObsA and ObsB as used
            used_obs_a.add(row['ObsA'])
            used_obs_b.add(row['ObsB'])

            # Increment the counter
            selected_count += 1

# Reset index for the selected rows dataframe
selected_rows.reset_index(drop=True, inplace=True)
"""

"""
# Initialize an empty list for selected pairs
selected_pairs = []
used_files = set()

# select pairs; but this is based on order in the sorted dataframe
# Note to self: _ is used in py as a throw away variable (it's the index, which isn't needed)
for _, row in df_sorted.iterrows():
    if row['ObsA'] not in used_files and row['ObsB'] not in used_files:
        # add the pair to the selected list
        selected_pairs.append(row)
        # mark these files as used
        used_files.update([row['ObsA'], row['ObsB']])
    # break the loop once 50 unique pairs are selected
    # if len(selected_pairs) >= 50:
    #    break

# convert and print
df_selected_pairs = pd.DataFrame(selected_pairs)
print(df_selected_pairs)
"""