import csv import os import random import pandas as pd from collections import defaultdict from pymongo import MongoClient import numpy as np from tqdm import tqdm from datetime import datetime dt_obj = datetime.strptime('2023-04-13 03:15:41', '%Y-%m-%d %H:%M:%S') data_path = "../../data_collected" filenames = ['oTree_Miami_2022-05-25.xlsx', 'oTree_Miami_2022-05-28.xlsx', 'oTree_Miami_2022-06-04.xlsx', 'oTree_Miami_2022-06-14.xlsx', 'oTree_Miami_2022-06-17.xlsx', 'oTree_Miami_2022-06-21.xlsx', 'oTree_Miami_2022-06-28.xlsx', 'oTree_Miami_2022-07-06.xlsx', 'oTree_Miami_2022-07-23.xlsx', 'oTree_Miami_2022-07-27.xlsx', 'oTree_Miami_2022-07-28.xlsx', 'oTree_Miami_2022-07-30.xlsx', 'oTree_Miami_2022-08-02.xlsx', 'oTree_Miami_2022-08-05.xlsx', 'oTree_Miami_2022-08-10.xlsx', 'oTree_Miami_2022-08-16.xlsx', 'oTree_Miami_2022-08-18.xlsx', 'oTree_Miami_2022-08-19.xlsx', 'oTree_Miami_2022-08-20.xlsx', 'oTree_Miami_2022-08-23.xlsx', 'oTree_Miami_2023-03-25.xlsx', 'oTree_Miami_2023-03-29.xlsx', 'oTree_Miami_2023-04-01.xlsx', 'oTree_Miami_2023-04-04.xlsx', 'oTree_Miami_2023-04-06.xlsx', 'oTree_Miami_2023-04-07.xlsx', 'oTree_Miami_2023-04-08.xlsx', 'oTree_Miami_2023-04-11.xlsx', 'oTree_Miami_2023-04-12.xlsx', 'oTree_Miami_2023-04-13.xlsx', 'oTree_Miami_2023-04-14.xlsx', 'oTree_Miami_2023-04-18.xlsx', 'oTree_Miami_2023-04-19.xlsx', 'oTree_Miami_2023-04-20.xlsx', 'oTree_Miami_2023-04-21.xlsx', 'oTree_Miami_2023-04-23.xlsx', 'oTree_Miami_2023-04-25.xlsx', 'oTree_Miami_2023-04-27.xlsx'] bl = [] bll = [] list_count = defaultdict(int) list_id = defaultdict(list) # with open(os.path.join(data_path, 'Duplicates.csv')) as f: # player_csv = csv.reader(f) # for row in player_csv: # if row[4] == '1': # if row[0] not in bl: # list_count[row[2]] += 1 # list_id[row[2]].append(row[0]) # bl.append(row[0]) accept_ids = [] txt_file = open('accept_ids.txt', 'r') lines = txt_file.readlines() for line in lines: accept_ids.append(line.rstrip('\n')) r = [] for filename in tqdm(filenames[:20]): df = pd.read_excel(os.path.join(data_path, filename), usecols=['participant.label', 'participant.time_started', 'bam122.51.player.AC_Correctness', 'bam122.51.player.csv_file_used']) df_li = df.values.tolist() for label, t, acc, csv_n in df_li: if csv_n != 0: if acc >= 0.5: r.append([label, t, csv_n]) for filename in tqdm(filenames[20:]): df = pd.read_excel(os.path.join(data_path, filename), usecols=['participant.label', 'participant.time_started', 'bam122.51.player.AC_Correctness', 'bam122.51.player.csv_file_used']) df_li = df.values.tolist() for label, t, acc, csv_n in df_li: if csv_n != 0: if acc >= 0.5 or label in accept_ids: r.append([label, t, csv_n]) r.sort(key=lambda x: x[1]) for ri in r: if ri[0] not in bl: list_id[ri[2]].append(ri[0]) list_count[ri[2]] += 1 bl.append(ri[0]) csv_list = [f'list_{str(i + 1)}.csv' for i in range(697)] res = [] for cl in csv_list: res.append([cl, list_count[cl], list_id[cl]]) res.sort(key=lambda x: x[0]) with pd.ExcelWriter('summary_from_data_raw_new.xlsx') as writer: df = pd.DataFrame(res, columns=['list_name', 'count', 'prolific_ids']) df.to_excel(writer)