import csv
import os
import random
import pandas as pd
from collections import defaultdict
from pymongo import MongoClient
import numpy as np
from tqdm import tqdm

# NOTE(review): the connection string below previously embedded a real username
# and password. They are redacted here; rotate the exposed password and load
# credentials from the environment instead of committing them.
# client = MongoClient('mongodb+srv://<user>:<password>@cluster0.lodffed.mongodb.net/?retryWrites=true&w=majority')
# db = client['miami-database']
# csv_collection = db['csv']
# record_collection = db['record']
# csv_list = [f'list_{str(i + 1)}.csv' for i in range(697)]
#
# query0 = record_collection.count_documents({"count": {'$eq': 0}})
# query1 = record_collection.count_documents({"count": {'$eq': 1}})
# query2 = record_collection.count_documents({"count": {'$eq': 2}})
# query3 = record_collection.count_documents({"count": {'$eq': 3}})
# query4 = record_collection.count_documents({"count": {'$eq': 4}})
# query5 = record_collection.count_documents({"count": {'$eq': 5}})
# print(query0 * 6 + query1 * 5 + query2 * 4 + query3 * 3 + query4 * 2 + query5 * 1)
# with open('../bam122/a.txt', 'a', encoding='utf-8') as file:
#     for x in record_collection.find({"count": {'$lt': 6}}):
#         file.write(x['name'] + '\n')
# res = []
# for x in record_collection.find({}):
#     res.append([x['name'], x['count'], x['prolific_ids']])
#
# with pd.ExcelWriter('summary_from_mongodb.xlsx') as writer:
#     df = pd.DataFrame(res, columns=['list_name', 'count', 'prolific_ids'])
#     df.to_excel(writer)
#
# data_path = "../../data_collected"
# filenames = ['oTree_Miami_2023-03-25.xlsx', 'oTree_Miami_2023-03-29.xlsx', 'oTree_Miami_2023-04-01.xlsx',
#              'oTree_Miami_2023-04-04.xlsx', 'oTree_Miami_2023-04-06.xlsx', 'oTree_Miami_2023-04-07.xlsx',
#              'oTree_Miami_2023-04-08.xlsx', 'oTree_Miami_2023-04-11.xlsx', 'oTree_Miami_2023-04-12.xlsx',
#              'oTree_Miami_2023-04-13.xlsx', 'oTree_Miami_2023-04-14.xlsx', 'oTree_Miami_2023-04-18.xlsx',
#              'oTree_Miami_2023-04-19.xlsx', 'oTree_Miami_2023-04-20.xlsx', 'oTree_Miami_2023-04-21.xlsx',
#              'oTree_Miami_2023-04-23.xlsx', 'oTree_Miami_2023-04-25.xlsx']
#
# bl = []
# bll = []
# list_count = defaultdict(int)
#
# NOTE(review): the commented-out code below is legacy code carried over from
# the original file. Its indentation was reconstructed from a
# whitespace-collapsed source; double-check it before re-enabling any of it.
# list_id = defaultdict(list)
# with open(os.path.join(data_path, 'Duplicates.csv')) as f:
#     player_csv = csv.reader(f)
#     for row in player_csv:
#         if row[4] == '1':
#             if row[0] not in bl:
#                 list_count[row[2]] += 1
#                 list_id[row[2]].append(row[0])
#                 bl.append(row[0])
#
# accept_ids = []
# txt_file = open('accept_ids.txt', 'r')
# lines = txt_file.readlines()
# for line in lines:
#     accept_ids.append(line.rstrip('\n'))
# ac_ids = []
# for filename in tqdm(filenames):
#     df = pd.read_excel(os.path.join(data_path, filename),
#                        usecols=['participant.label', 'bam122.51.player.AC_Correctness',
#                                 'bam122.51.player.csv_file_used'])
#     df_li = df.values.tolist()
#     for label, acc, csv_n in df_li:
#         # if csv_n == 'list_453.csv':
#         #     print('find_one!')
#         if acc >= 0.5 and csv_n != 0:
#             if label not in bl:
#                 list_id[csv_n].append(label)
#                 list_count[csv_n] += 1
#                 bl.append(label)
#         else:
#             if label in accept_ids and csv_n != 0:
#                 # ac_ids.append([label, csv_n, filename])
#                 if label not in bl:
#                     list_id[csv_n].append(label)
#                     list_count[csv_n] += 1
#                     bl.append(label)
#
# # v = list(set(bl))
#
# # csv_list = [f'list_{str(i + 1)}.csv' for i in range(697)]
# res = []
# for cl in csv_list:
#     res.append([cl, list_count[cl], list_id[cl]])
#
# with pd.ExcelWriter('summary_from_data1.xlsx') as writer:
#     df = pd.DataFrame(res, columns=['list_name', 'count', 'prolific_ids'])
#     df.to_excel(writer)
# with pd.ExcelWriter('summary_for_accepted_ids2.xlsx') as writer:
#     df = pd.DataFrame(ac_ids, columns=['ids', 'csv_used', 'filename'])
#     df.to_excel(writer)

# Number of list_<n>.csv files (list_1.csv .. list_697.csv) that are scanned.
NUM_LISTS = 697
# Text file with one image URL/path per line to look up in the list CSVs.
QUERY_FILE = './2.txt'
# Extension removed from file names to obtain the image id.
IMAGE_SUFFIX = '.jpg'


def image_id_from_url(url):
    """Return the last ``/``-separated component of *url* without ``.jpg``.

    The suffix is removed only when the name actually ends with ``.jpg``.
    ``str.rstrip('.jpg')`` must not be used for this: it strips any trailing
    run of the characters ``.``, ``j``, ``p`` and ``g``, so ``imgp.jpg`` would
    become ``im`` and distinct ids such as ``img`` and ``imp`` would collide.
    """
    name = url.split('/')[-1]
    if name.endswith(IMAGE_SUFFIX):
        name = name[:-len(IMAGE_SUFFIX)]
    return name


def load_list_ids(csv_name):
    """Read *csv_name* (no header row) and return the image ids of column 0.

    The ids are returned as a set so that membership tests are O(1).
    """
    frame = pd.read_csv(csv_name, header=None)
    return {image_id_from_url(url) for url in frame.iloc[:, 0].tolist()}


def find_matching_lists(lines, total_urls):
    """Return the names in *total_urls* that contain an id found in *lines*.

    Args:
        lines: iterable of URL/path strings, each optionally ending in a
            newline (e.g. an open text file).
        total_urls: mapping of list name -> iterable of image ids.

    Returns:
        A set with every list name whose ids share at least one id with
        *lines*.
    """
    wanted = {image_id_from_url(line.strip('\n')) for line in lines}
    return {
        name for name, ids in total_urls.items()
        if not wanted.isdisjoint(ids)
    }


def main():
    """Print the set of list CSVs that contain any image named in QUERY_FILE."""
    csv_list = [f'list_{i + 1}.csv' for i in range(NUM_LISTS)]
    total_urls = {csv_name: load_list_ids(csv_name) for csv_name in csv_list}
    with open(QUERY_FILE, 'r', encoding='utf-8') as file:
        # tqdm only wraps the line iterator to show read progress.
        res = find_matching_lists(tqdm(file), total_urls)
    print(res)


if __name__ == '__main__':
    main()