import pandas as pd
import numpy as np
import os
from sklearn import linear_model
from sklearn.metrics import log_loss
# Input and output locations.  These are relative paths; they are first used
# after the os.chdir() call below.
results_file = '../raw_inputs/anon_application_results.csv'
barrons_cases = '../raw_inputs/odds_cases-barrons.csv'
barrons_output_file = '../inputs/anon_barrons_coefs.csv'
schools_output_file = '../inputs/anon_schools_coefs.csv'
os.chdir('../inputs')

# Load the admissions results and take a quick look at them
df = pd.read_csv(results_file, encoding='cp1252')
df.head()
print(len(df))
df.hs_class.value_counts()

# Load the Barrons case definitions used for the first round of analysis
b_case_df = pd.read_csv(barrons_cases, encoding='cp1252')
b_case_df.head()
print(len(b_case_df))
b_case_df.RACE.value_counts()
b_case_df.hs_class.describe()

# Results are collected in a list of lists (header row first) so they can be
# turned into a DataFrame and saved once every case has been processed
output_table = [[
    'Case', 'N', 'N1', 'GPAcoef', 'ACTcoef', 'Int', 'Score', 'Loss',
    '50gpa', '50act', '50pred', 'Plus.05/.5red', 'Plus.1/1pred',
]]
# Here is a function used to do the regression analyses
def run_lregression(data):
    """Fit a logistic regression on *data* and summarize the fit.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose leading columns are the independent variables and
        whose final column is the outcome (Y).  The summary reads the first
        two columns as GPA and ACT, in that order.

    Returns
    -------
    list
        ``[Y.sum(), GPAcoef, ACTcoef, intercept, score, loss, avg_yes_gpa,
        avg_act_yes, pred0, pred1, pred2]`` where

        * ``score`` is ``lr.score`` and ``loss`` is ``log_loss``, both
          computed on the training data itself;
        * ``avg_yes_gpa`` is the mean GPA of the observations whose predicted
          probability lies strictly between 0.35 and 0.65;
        * ``avg_act_yes`` is the ACT value that puts ``avg_yes_gpa`` exactly
          on the fitted decision boundary;
        * ``pred0``/``pred1``/``pred2`` are column 1 of ``predict_proba`` at
          that point, at +0.05 GPA / +0.5 ACT and at +0.1 GPA / +1 ACT.

    Raises
    ------
    ValueError
        If no observation has a predicted probability between 0.35 and 0.65,
        so the sensitivity statistics cannot be computed.  Errors raised by
        scikit-learn while fitting or predicting propagate unchanged.
    """
    # C is the inverse regularization strength, so a huge C makes the fit
    # effectively unregularized.
    lr = linear_model.LogisticRegression(C=10000000000, solver='newton-cg')
    X = data[:, :-1]
    Y = data[:, -1]
    lr.fit(X, Y)
    GPAcoef = lr.coef_[0][0]
    ACTcoef = lr.coef_[0][1]
    intercept = lr.intercept_[0]
    score = lr.score(X, Y)

    # Predict once and reuse the probabilities for both the loss and the
    # sensitivity statistics.
    preds = lr.predict_proba(X)
    loss = log_loss(Y, preds)

    # now create some sensitivity stats
    # first find the average gpa of points near 50/50 (the band is symmetric,
    # so testing the first probability column is enough)
    near_even = (preds[:, 0] > 0.35) & (preds[:, 0] < 0.65)
    if not near_even.any():
        # Previously this produced a NaN mean and an opaque "Input contains
        # NaN" ValueError from predict_proba; fail with a clear message.
        raise ValueError('no observations with a predicted probability '
                         'between 0.35 and 0.65')
    avg_yes_gpa = np.mean(X[near_even, 0])

    # then calculate the ACT that corresponds to this average, i.e. solve
    # intercept + GPAcoef*gpa + ACTcoef*act = 0 for act
    avg_act_yes = (-intercept - avg_yes_gpa * GPAcoef) / ACTcoef

    # next, build a sensitivity matrix and check the predictions
    X_check = np.array([
        [avg_yes_gpa, avg_act_yes],
        [avg_yes_gpa + 0.05, avg_act_yes + .5],
        [avg_yes_gpa + 0.1, avg_act_yes + 1],
    ])
    pred_check = lr.predict_proba(X_check)
    return [Y.sum(), GPAcoef, ACTcoef, intercept, score, loss,
            avg_yes_gpa, avg_act_yes, pred_check[0][1], pred_check[1][1],
            pred_check[2][1]]
# We'll now iterate over each of the barrons cases for analysis.
# We skip the ones where the title begins with a #
for _, case in b_case_df.iterrows():
    # str() keeps an empty or missing title from raising on the check
    if str(case.Title).startswith('#'):
        continue

    case_filter = (df.RACE == case.RACE) & (df.Barrons == case.Barrons)
    if case.hs_class == 'ALL':
        this_df = df[case_filter]
    else:
        # NOTE(review): every non-'ALL' case is restricted to the 2018 class
        # regardless of the value in the case row -- confirm this is intended
        this_df = df[case_filter & (df.hs_class == 2018)]

    # Now get the right array to send to the analysis function: ACT is
    # expressed relative to the school's ACT50 (W/A) or ACT25 (otherwise),
    # and rows where that reference value is missing are dropped.
    act_var = 'ACT50' if case.RACE == 'W/A' else 'ACT25'
    short_df = this_df[['GPA', 'ACT', 'Y', act_var]]
    # Work on an explicit copy so the assignment below does not write into a
    # slice of df (which triggers pandas' SettingWithCopyWarning)
    short_df = short_df[pd.notnull(short_df[act_var])].copy()
    short_df['ACT'] = short_df['ACT'] - short_df[act_var]
    trial_data = short_df[['GPA', 'ACT', 'Y']].values

    # Now complete the regression and append the result to the output table
    if len(trial_data) <= 1:
        continue
    # The reported N is len(this_df), i.e. the count before rows with a
    # missing act_var were dropped
    print('%s: %d' % (case.Title, len(this_df)), end='')
    try:
        reg_response = run_lregression(trial_data)
    except Exception as e:
        # Best effort: a case that cannot be fit is skipped, but say why
        print('...No dice! (%s: %s)' % (type(e).__name__, e))
    else:
        new_row = [case.Title, len(this_df)]
        new_row.extend(reg_response)
        output_table.append(new_row)
        print('...Works!')

barrons_output = pd.DataFrame(output_table[1:], columns=output_table[0])
barrons_output
barrons_output.to_csv(barrons_output_file, index=False)
# Build the list of school/race cases to analyze.  A school is analyzed on the
# current class alone when that class has at least 10 results of each outcome;
# otherwise on all classes when those have at least 10 of each (or at least 5
# of each if the school has no ACT25 value).
analysis_list = [['Label', 'NCES', 'RACE', 'hs_class']]
current_year = 2018
picker_df = df[['NCES', 'RACE', 'Y', 'hs_class', 'ACT25', 'type']]
for race in ['H', 'AA', 'W/A']:
    this_df = picker_df[picker_df.RACE == race]
    # Unique schools in order of first appearance, so the output order is
    # reproducible from run to run; rows with a missing NCES match no school
    # and are left out.
    nces_vals = list(dict.fromkeys(this_df.NCES.dropna()))
    for nces in nces_vals:
        nces_df = this_df[this_df.NCES == nces]
        is_yes = nces_df.Y == 1
        is_no = nces_df.Y == 0
        is_current = nces_df.hs_class == current_year
        ty_yes = (is_yes & is_current).sum()
        ty_no = (is_no & is_current).sum()
        ay_yes = is_yes.sum()
        ay_no = is_no.sum()
        # Read ACT25 from this school's own rows; taking it from the first
        # row of the whole race subset gave every school the same value.
        act25 = nces_df.ACT25.iloc[0]
        if ty_yes >= 10 and ty_no >= 10:
            analysis_list.append([race + ':' + str(nces), nces, race,
                                  current_year])
        elif (ay_yes >= 10 and ay_no >= 10) or (
                pd.isnull(act25) and ay_yes >= 5 and ay_no >= 5):
            analysis_list.append([race + ':' + str(nces), nces, race, 'ALL'])

# Quick looks at the result (no effect when run as a plain script)
len(analysis_list)
anl_short = analysis_list[:5]
anl_short
analysis_list[-5:]
# Now that we have a full list for analysis, we can run regressions for each
output_table = [[
    'Case', 'N', 'N1', 'GPAcoef', 'ACTcoef', 'Int', 'Score', 'Loss',
    '50gpa', '50act', '50pred', 'Plus.05/.5red', 'Plus.1/1pred',
]]
for case, nces, race, hs_class in analysis_list[1:]:
    school_filter = (df.RACE == race) & (df.NCES == nces)
    if hs_class != 'ALL':
        # Restrict to the single class named in the analysis list
        school_filter = school_filter & (df.hs_class == hs_class)
    this_df = df[school_filter]

    # Now get the right array to send to the analysis function:
    trial_data = this_df[['GPA', 'ACT', 'Y']].values

    # Now complete the regression and append the result to the output table
    if len(trial_data) <= 1:
        continue
    print('%s: %d' % (case, len(this_df)), end='')
    try:
        reg_response = run_lregression(trial_data)
    except Exception as e:
        # Best effort: a case that cannot be fit is skipped, but say why
        print('...No dice! (%s: %s)' % (type(e).__name__, e))
    else:
        new_row = [case, len(this_df)]
        new_row.extend(reg_response)
        output_table.append(new_row)
        print('...Works!')

len(output_table)
schools_output = pd.DataFrame(output_table[1:], columns=output_table[0])
schools_output.head()
analysis_df = pd.DataFrame(analysis_list[1:], columns=analysis_list[0])
# Save the coefficients and the list of cases that was analyzed
schools_output.to_csv(schools_output_file, index=False)
analysis_df.to_csv('anon_school_analysis_list.csv', index=False)
# Redo of the analysis (this list was created by flagging cases
# in the file saved previously):
a_list = [
    ['H:144281', '144281', 'H', 'ALL'],
    ['H:170532', '170532', 'H', 'ALL'],
    ['H:Arrupe', 'Arrupe', 'H', 'ALL'],
    ['H:145691', '145691', 'H', 'ALL'],
    ['AA:216597', '216597', 'AA', 'ALL'],
    ['AA:178396', '178396', 'AA', 'ALL'],
    ['AA:170532', '170532', 'AA', 'ALL'],
    ['AA:Arrupe', 'Arrupe', 'AA', 'ALL'],
    ['AA:145691', '145691', 'AA', 'ALL'],
    ['AA:170082', '170082', 'AA', 'ALL'],
    ['AA:144740', '144740', 'AA', 'ALL'],
]
output_table = [[
    'Case', 'N', 'N1', 'GPAcoef', 'ACTcoef', 'Int', 'Score', 'Loss',
    '50gpa', '50act', '50pred', 'Plus.05/.5red', 'Plus.1/1pred',
]]
for case, nces, race, hs_class in a_list:
    school_filter = (df.RACE == race) & (df.NCES == nces)
    if hs_class != 'ALL':
        # Restrict to the single class named in the case
        school_filter = school_filter & (df.hs_class == hs_class)
    this_df = df[school_filter]

    # Now get the right array to send to the analysis function:
    trial_data = this_df[['GPA', 'ACT', 'Y']].values

    # Now complete the regression and append the result to the output table
    if len(trial_data) <= 1:
        continue
    print('%s: %d' % (case, len(this_df)), end='')
    try:
        reg_response = run_lregression(trial_data)
    except Exception as e:
        # Best effort: a case that cannot be fit is skipped, but say why
        print('...No dice! (%s: %s)' % (type(e).__name__, e))
    else:
        new_row = [case, len(this_df)]
        new_row.extend(reg_response)
        output_table.append(new_row)
        print('...Works!')

schools_output2 = pd.DataFrame(output_table[1:], columns=output_table[0])
schools_output2
schools_output2.to_csv('anon_schools_coefs_take_2.csv', index=False)