Calculate odds coefficients

This file is also a work in progress--a partial migration of other code

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn import linear_model
from sklearn.metrics import log_loss
In [12]:
results_file='../raw_inputs/anon_application_results.csv'
barrons_cases='../raw_inputs/odds_cases-barrons.csv'
barrons_output_file='../inputs/anon_barrons_coefs.csv'
schools_output_file='../inputs/anon_schools_coefs.csv'
In [3]:
os.chdir('../inputs')
In [4]:
# First we're going to read the admissions results
df = pd.read_csv(results_file,encoding='cp1252')
df.head()
Out[4]:
NCES hs_student_id hs_class type result_code Campus RACE GPA ACT Y collegename Barrons ACT25 ACT50
0 148876 921691211 2018 Regular Accepted Campus1 H 2.80 13 1 St. Augustine College Noncompetitive NaN NaN
1 148654 1599951305 2018 Regular Accepted Campus1 H 2.50 19 1 University of Illinois at Springfield Competitive 20.0 23.0
2 148654 479129093 2018 Regular Accepted Campus1 H 2.70 20 1 University of Illinois at Springfield Competitive 20.0 23.0
3 148654 1178256154 2018 Regular Accepted Campus1 AA 2.24 22 1 University of Illinois at Springfield Competitive 20.0 23.0
4 148654 1903150611 2018 Regular Accepted Campus1 H 2.16 22 1 University of Illinois at Springfield Competitive 20.0 23.0
In [5]:
print(len(df))
df.hs_class.value_counts()
68757
Out[5]:
2018    19446
2017    16947
2016    16566
2015    15798
Name: hs_class, dtype: int64
In [6]:
#First, we're going to analyze this data based on Barrons classes
b_case_df = pd.read_csv(barrons_cases,encoding='cp1252')
b_case_df.head()
Out[6]:
Title Barrons RACE hs_class
0 H:Most Competitive+ Most Competitive+ H 2018
1 H:Most Competitive Most Competitive H 2018
2 H:Highly Competitive Highly Competitive H 2018
3 H:Very Competitive Very Competitive H 2018
4 H:Competitive Competitive H 2018
In [7]:
print(len(b_case_df))
b_case_df.RACE.value_counts()
30
Out[7]:
AA     10
W/A    10
H      10
Name: RACE, dtype: int64
In [8]:
b_case_df.hs_class.describe()
Out[8]:
count       30
unique       2
top       2018
freq        20
Name: hs_class, dtype: object
In [9]:
# We'll iterate over different cases, save them to a LoL for saving out
output_table=[['Case','N','N1','GPAcoef','ACTcoef','Int','Score','Loss',
               '50gpa','50act', '50pred','Plus.05/.5red','Plus.1/1pred']]
In [10]:
# Here is a function used to do the regression analyses
def run_lregression(data):
    '''Returns the logistic regression results for the passed numpy array
    where the first columns are the independent variables and the final
    column is the outcome (Y)'''
    lr = linear_model.LogisticRegression(C=10000000000, solver='newton-cg')
    X = data[:,:-1]
    Y = data[:,-1]
    lr.fit(X, Y)
    GPAcoef = lr.coef_[0][0]
    ACTcoef = lr.coef_[0][1]
    intercept = lr.intercept_[0]
    score = lr.score(X,Y)
    loss = log_loss(Y, lr.predict_proba(X))
    # now create some sensitivity stats
    # first find the average gpa of points near 50/50
    preds = lr.predict_proba(X)
    gpa_yes = []
    for i in range(len(preds)):
        if (preds[i][0] > 0.35) and (preds[i][0] < 0.65):
            gpa_yes.append(X[i,0])

    # then calculate the ACT that corresponds to this average
    avg_yes_gpa = np.mean(gpa_yes)
    avg_act_yes = (-intercept - avg_yes_gpa*GPAcoef)/ACTcoef

    # next, build a sensitivity matrix and check the predictions
    X_check = np.array([[avg_yes_gpa, avg_act_yes],
                           [avg_yes_gpa+0.05, avg_act_yes+.5],
                           [avg_yes_gpa+0.1, avg_act_yes+1]])
    pred_check = lr.predict_proba(X_check)


    return [Y.sum(), GPAcoef, ACTcoef, intercept, score, loss,
            avg_yes_gpa, avg_act_yes, pred_check[0][1],pred_check[1][1],
            pred_check[2][1]]
In [11]:
# We'll now iterate over each of the barrons cases for analysis.
# We skip the ones where the title begins with a #
for i, case in b_case_df.iterrows():
    if case.Title[0] != '#':
        if case.hs_class == 'ALL':
            this_df = df[(df.RACE == case.RACE)&(df.Barrons == case.Barrons)]
        else:
            this_df = df[(df.RACE == case.RACE)&(df.Barrons == case.Barrons)&(df.hs_class == 2018)]
    
        # Now get the right array to send to the analysis function:
        act_var = 'ACT50' if case.RACE == 'W/A' else 'ACT25'
        short_df = this_df[['GPA','ACT','Y',act_var]]
        short_df = short_df[pd.notnull(short_df[act_var])]
        short_df['ACT'] = short_df['ACT']-short_df[act_var]
        trial_data = short_df[['GPA','ACT','Y']].values
        
        # Now complete the regression and append the result to the output table
        if len(trial_data) > 1:
            print('%s: %d' % (case.Title, len(this_df)),end='')
            try:
                reg_response = run_lregression(trial_data)
                new_row = [case.Title, len(this_df)]
                new_row.extend(reg_response)
                output_table.append(new_row)
                print('...Works!')
            except Exception as e:
                print('...No dice!')
                #raise e
H:Most Competitive+: 490...Works!
H:Most Competitive: 810...Works!
H:Highly Competitive: 1397...Works!
H:Very Competitive: 3218...Works!
H:Competitive: 4013...Works!
H:Less Competitive: 467...Works!
H:Noncompetitive: 276...Works!
B:Most Competitive+: 176...Works!
B:Most Competitive: 389...Works!
B:Highly Competitive: 684...Works!
B:Very Competitive: 1378...Works!
B:Competitive: 3628...Works!
B:Less Competitive: 712...Works!
B:Noncompetitive: 299...Works!
W:Most Competitive+: 241...Works!
W:Most Competitive: 330...Works!
W:Highly Competitive: 325...Works!
W:Very Competitive: 645...Works!
W:Competitive: 422...Works!
W:Less Competitive: 24...No dice!
W:Noncompetitive: 31...No dice!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
In [13]:
barrons_output = pd.DataFrame(output_table[1:],columns=output_table[0])
barrons_output
Out[13]:
Case N N1 GPAcoef ACTcoef Int Score Loss 50gpa 50act 50pred Plus.05/.5red Plus.1/1pred
0 H:Most Competitive+ 490 82.0 3.269467 0.277182 -13.661696 0.834395 0.366029 4.128475 0.590886 0.5 0.574947 0.646600
1 H:Most Competitive 810 370.0 4.001205 0.258949 -13.747247 0.716644 0.545084 3.638453 -3.131696 0.5 0.581646 0.659051
2 H:Highly Competitive 1397 756.0 3.055204 0.238978 -9.221920 0.736465 0.525613 3.284773 -3.405059 0.5 0.567645 0.632858
3 H:Very Competitive 3218 1910.0 2.218157 0.395526 -5.042510 0.851073 0.378709 2.840413 -3.180506 0.5 0.576561 0.649614
4 H:Competitive 4013 3326.0 3.266927 0.454606 -7.186940 0.890890 0.273525 2.507599 -2.211151 0.5 0.596439 0.685960
5 H:Less Competitive 467 127.0 8.701126 0.688007 -20.877240 0.942408 0.195938 2.376667 0.287152 0.5 0.685477 0.826083
6 H:Noncompetitive 276 125.0 1.834104 0.564467 -3.168692 0.858974 0.378682 2.244545 -1.679529 0.5 0.592410 0.678716
7 B:Most Competitive+ 176 22.0 1.873550 0.372499 -7.266101 0.876543 0.302931 4.106923 -1.150140 0.5 0.569528 0.636419
8 B:Most Competitive 389 140.0 3.961763 0.295798 -13.546436 0.726790 0.499708 3.698269 -3.736435 0.5 0.585644 0.666406
9 B:Highly Competitive 684 364.0 2.906901 0.332415 -7.908246 0.765411 0.502075 3.299068 -5.059396 0.5 0.577264 0.650924
10 B:Very Competitive 1378 862.0 1.793500 0.513101 -3.332770 0.832465 0.399752 2.920345 -3.712463 0.5 0.585702 0.666512
11 B:Competitive 3628 2881.0 1.962268 0.457714 -3.427814 0.863822 0.319347 2.466144 -3.083632 0.5 0.581022 0.657898
12 B:Less Competitive 712 478.0 3.410053 0.383454 -7.475473 0.915596 0.226717 2.228704 -0.324744 0.5 0.589580 0.673588
13 B:Noncompetitive 299 189.0 3.166343 0.339668 -5.808775 0.866071 0.313812 1.931905 -0.907646 0.5 0.581309 0.658429
14 W:Most Competitive+ 241 23.0 3.245932 0.222532 -14.352469 0.900000 0.279727 4.170000 3.671081 0.5 0.567967 0.633468
15 W:Most Competitive 330 128.0 2.488886 0.121962 -8.937293 0.647651 0.611409 3.748138 -3.209172 0.5 0.546224 0.591665
16 W:Highly Competitive 325 201.0 2.901873 0.256741 -8.228206 0.771930 0.480871 3.299529 -5.244998 0.5 0.567943 0.633423
17 W:Very Competitive 645 428.0 2.196075 0.540214 -3.800219 0.883858 0.284568 3.027842 -5.274109 0.5 0.593852 0.681315
18 W:Competitive 422 364.0 3.131017 0.480638 -5.785281 0.931373 0.209902 2.394009 -3.558606 0.5 0.597935 0.688634
In [14]:
barrons_output.to_csv(barrons_output_file,index=False)

That completes the main calculation that most odds results are based upon

Visually inspect the odds for the following things:

  1. GPA and ACT coefficients are positive. If they're not, the odds are likely bad
  2. The last two columns show reasonable increases in odds from a GPA increase of 0.05/0.10 and ACT increase of .5/1. These are a measure of the "spread" around the 50/50 odds line. If the numbers are two big, it means the model is too certain of the results on either side of the line. This can be corrected by doubling (or tripling) all 3 coefficients

We're now going to calculate odds for specific colleges if we have enough results

A few principles:

  1. The general rule of thumb is we'll want to have at least 10 positive results and 10 negative results
  2. We'll relax the above to 5 and 5 if there is no ACT25 for the school--that means they're likely test optional, or, at a minimum, that our ACT25 based calculation for the Barrons calcs won't work for that school
  3. If we have the minimum number of results for a school for the specific race (AA, H, or W/A), we prefer to do the analysis only on the most recent year
  4. If we don't have enough results, we can extend to the prior years
In [15]:
analysis_list=[['Label','NCES','RACE','hs_class']]
current_year = 2018
picker_df = df[['NCES','RACE','Y','hs_class','ACT25','type']]
for race in ['H', 'AA', 'W/A']:
    this_df = picker_df[picker_df.RACE==race]
    nces_vals = list(set(this_df.NCES))
    for nces in nces_vals:
        nces_df = this_df[this_df.NCES == nces]
        ty_yes = sum((nces_df.Y == 1) & (nces_df.hs_class == current_year))
        ty_no = sum((nces_df.Y == 0) & (nces_df.hs_class == current_year))
        ay_yes = sum(nces_df.Y == 1)
        ay_no = sum(nces_df.Y ==0)
        act25 = this_df.ACT25.iloc[0]
        if (ty_yes >= 10) & (ty_no >= 10):
            analysis_list.append([race+':'+str(nces), nces, race, current_year])
        elif ((ay_yes >= 10)&(ay_no >=10))|(np.isnan(act25)&(ay_yes >=5)&(ay_no >= 5)):
            analysis_list.append([race+':'+str(nces), nces, race, 'ALL'])
In [16]:
len(analysis_list)
Out[16]:
291
In [17]:
anl_short=analysis_list[:5]
anl_short
Out[17]:
[['Label', 'NCES', 'RACE', 'hs_class'],
 ['H:148584', '148584', 'H', 'ALL'],
 ['H:169248', '169248', 'H', 'ALL'],
 ['H:154527', '154527', 'H', 'ALL'],
 ['H:150136', '150136', 'H', 'ALL']]
In [18]:
analysis_list[-5:]
Out[18]:
[['W/A:170976', '170976', 'W/A', 'ALL'],
 ['W/A:174844', '174844', 'W/A', 'ALL'],
 ['W/A:240444', '240444', 'W/A', 'ALL'],
 ['W/A:145637', '145637', 'W/A', 'ALL'],
 ['W/A:170532', '170532', 'W/A', 'ALL']]
In [19]:
# Now that we have a full list for analysis, we can run regressions for each
output_table=[['Case','N','N1','GPAcoef','ACTcoef','Int','Score','Loss',
               '50gpa','50act', '50pred','Plus.05/.5red','Plus.1/1pred']]
for case, nces, race, hs_class in analysis_list[1:]:
    if hs_class == 'ALL':
        this_df = df[(df.RACE == race)&(df.NCES == nces)]
    else:
        this_df = df[(df.RACE == race)&(df.NCES == nces)&(df.hs_class == hs_class)]
    
    # Now get the right array to send to the analysis function:
    trial_data = this_df[['GPA','ACT','Y']].values
        
    # Now complete the regression and append the result to the output table
    if len(trial_data) > 1:
        print('%s: %d' % (case, len(this_df)),end='')
        try:
            reg_response = run_lregression(trial_data)
            new_row = [case, len(this_df)]
            new_row.extend(reg_response)
            output_table.append(new_row)
            print('...Works!')
        except Exception as e:
            print('...No dice!')
            #raise e
H:148584: 63...Works!
H:169248: 318...Works!
H:154527: 29...Works!
H:150136: 36...Works!
H:151324: 35...No dice!
H:229267: 86...No dice!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
H:195526: 162...No dice!
H:197869: 31...Works!
H:178615: 129...Works!
H:153603: 58...Works!
H:204024: 32...Works!
H:152673: 98...Works!
H:147411: 36...No dice!
H:146719: 139...Works!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
H:131496: 24...Works!
H:122612: 121...Works!
H:144281: 71...Works!
H:135726: 17...No dice!
H:144962: 119...Works!
H:117946: 35...Works!
H:213385: 199...Works!
H:153834: 150...Works!
H:153269: 38...Works!
H:143358: 118...Works!
H:149231: 257...Works!
H:144892: 291...Works!
H:152248: 14...No dice!
H:145646: 103...Works!
H:149505: 99
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
...Works!
H:148487: 55...Works!
H:149772: 247...Works!
H:164988: 134...Works!
H:147776: 131...Works!
H:153384: 66...Works!
H:120254: 70...Works!
H:154350: 46...Works!
H:145725: 94...Works!
H:168218: 14...Works!
H:138600: 24...Works!
H:144740: 434...Works!
H:202480: 158...Works!
H:147703: 208...Works!
H:179159: 87...Works!
H:212674: 35...Works!
H:147536: 275...Works!
H:209056: 42...Works!
H:169798: 28...Works!
H:211440: 25...Works!
H:128902: 31...Works!
H:146427: 77...Works!
H:221351: 40...Works!
H:150163: 49...Works!
H:202523: 69...Works!
H:178396: 68...Works!
H:143084: 101...Works!
H:215062: 34...Works!
H:179867: 128...Works!
H:219709: 43...Works!
H:143118: 148...Works!
H:174844: 179...Works!
H:149222: 342...Works!
H:190415: 40...No dice!
H:239318: 34...Works!
H:147679: 177...Works!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
H:211291: 81...Works!
H:154095: 67...Works!
H:153621: 59...No dice!
H:127060: 101...Works!
H:243780: 73...Works!
H:151111: 10...No dice!
H:212009: 83...Works!
H:148627: 127...Works!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
H:150455: 41...Works!
H:151351: 34...Works!
H:206589: 64...Works!
H:148654: 69...Works!
H:153144: 211...Works!
H:170532: 75...Works!
H:203535: 64...Works!
H:152390: 209...Works!
H:190099: 54...Works!
H:147660: 125...Works!
H:148496: 373...Works!
H:162928: 20...Works!
H:121345: 62...Works!
H:182670: 46...Works!
H:174066: 23...No dice!
H:216287: 19...Works!
H:146612: 74...Works!
H:212577: 60...Works!
H:130794: 40...Works!
H:122436: 14...No dice!
H:149781: 23
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
...Works!
H:156295: 37...Works!
H:145691: 100...Works!
H:139658: 64...Works!
H:112260: 29...Works!
H:213668: 150...Works!
H:173258: 60...Works!
H:213543: 14...Works!
H:198385: 48...Works!
H:204796: 56...Works!
H:161004: 16...Works!
H:233374: 12...Works!
H:164924: 42...Works!
H:221519: 94...Works!
H:131159: 14...No dice!
H:145637: 550...Works!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
H:148335: 142...Works!
H:164465: 23...Works!
H:126678: 43...Works!
H:147244: 192...Works!
H:154493: 52...Works!
H:201645: 26...Works!
H:152600: 188...Works!
H:243744: 55...Works!
H:196413: 62...Works!
H:168148: 26...Works!
H:174817: 116...Works!
H:217156: 50...Works!
H:145600: 781...Works!
H:239017: 14...No dice!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
H:137847: 71...Works!
H:204501: 65...Works!
H:144005: 111...Works!
H:152080: 65...Works!
H:132471: 60...Works!
H:239628: 286...Works!
H:231174: 37...Works!
H:176965: 21...Works!
H:186131: 29...Works!
H:239105: 161...Works!
H:199607: 33...Works!
H:230959: 47...Works!
H:150774: 45...Works!
H:170301: 196...Works!
H:146676: 223...Works!
H:170082: 88...Works!
H:167358: 32...Works!
H:153658: 358...Works!
H:173902: 23...Works!
H:145813: 442...Works!
H:170976: 163...Works!
H:Arrupe: 110...Works!
H:240444: 202...Works!
H:204857: 14...Works!
H:216597: 59...Works!
H:147767: 84...Works!
H:234076: 35...Works!
H:168546: 448...Works!
H:171100: 46...Works!
H:130697: 33...Works!
H:153278: 224...Works!
H:143288: 95...Works!
H:150400: 138...Works!
H:153162: 120...Works!
H:144351: 94...Works!
H:221999: 50...Works!
H:147341: 475...Works!
H:195030: 46...Works!
H:146481: 113...Works!
H:195003: 13...Works!
H:238333: 86...Works!
H:203517: 21...Works!
H:144050: 118...Works!
H:147013: 33...Works!
AA:148584: 32...Works!
AA:169248: 413...Works!
AA:154527: 62...Works!
AA:150136: 46...Works!
AA:151324: 68...Works!
AA:229267: 73...Works!
AA:178615: 140...Works!
AA:153603: 75...Works!
AA:141060: 79...Works!
AA:152673: 32...Works!
AA:175856: 327...Works!
AA:146719: 86...Works!
AA:122612: 64...Works!
AA:144281: 52...Works!
AA:135726: 30...Works!
AA:144962: 89...Works!
AA:153834: 195...Works!
AA:143358: 41...Works!
AA:149231: 371...Works!
AA:144892: 301...Works!
AA:152248: 44...Works!
AA:145646: 83...Works!
AA:149505: 46...Works!
AA:148487: 63...Works!
AA:149772: 330...Works!
AA:147776: 50...Works!
AA:153384: 75...Works!
AA:102234: 31...Works!
AA:169080: 73...Works!
AA:120254: 34...Works!
AA:145725: 66...Works!
AA:133650: 21...Works!
AA:131520: 96...Works!
AA:201690: 159...Works!
AA:232937: 57...Works!
AA:144740: 108...Works!
AA:202480: 173...Works!
AA:147703: 212...Works!
AA:113698: 28...Works!
AA:212674: 62...Works!
AA:147536: 425...Works!
AA:169798: 40...Works!
AA:146427: 302...Works!
AA:150163: 52...Works!
AA:202523: 28...Works!
AA:178396: 80...Works!
AA:199102: 33...Works!
AA:143084: 56...Works!
AA:179867: 102...Works!
AA:219709: 39...Works!
AA:143118: 53...Works!
AA:174844: 74...Works!
AA:149222: 76...Works!
AA:147679: 105...Works!
AA:154095: 43...Works!
AA:127060: 55...Works!
AA:243780: 54...Works!
AA:100654: 282...Works!
AA:148627: 145...Works!
AA:206589: 95...Works!
AA:148654: 352...Works!
AA:153144: 177...Works!
AA:170532: 65...Works!
AA:147660: 81...Works!
AA:148496: 265...Works!
AA:146612: 201...Works!
AA:138947: 332...Works!
AA:169910: 25...No dice!
AA:156295: 37...Works!
AA:145691: 30...Works!
AA:139658: 98...Works!
AA:201441: 39...Works!
AA:213668: 30...Works!
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
c:\users\mniksch\dropbox (nnocs)\documents\noblegit\venvs\legacy\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
AA:102377: 198...Works!
AA:148131: 36...Works!
AA:204796: 45...Works!
AA:164924: 53...Works!
AA:145637: 308...Works!
AA:145336: 32...Works!
AA:147244: 116...Works!
AA:154493: 169...Works!
AA:152600: 194...Works!
AA:196413: 66...Works!
AA:145600: 263...Works!
AA:144005: 178...Works!
AA:152080: 30...Works!
AA:239628: 134...Works!
AA:231174: 79...Works!
AA:176965: 39...Works!
AA:239105: 56...Works!
AA:150774: 113...Works!
AA:170301: 82...Works!
AA:146676: 101...Works!
AA:170082: 155...Works!
AA:140553: 107...Works!
AA:153658: 318...Works!
AA:173902: 30...Works!
AA:145813: 313...Works!
AA:170976: 93...Works!
AA:Arrupe: 41...Works!
AA:240444: 60...Works!
AA:216597: 43...Works!
AA:147767: 95...Works!
AA:100724: 68...Works!
AA:234076: 33...Works!
AA:168546: 223...Works!
AA:143288: 111...Works!
AA:150400: 33...Works!
AA:193900: 25...Works!
AA:144351: 101...Works!
AA:147341: 215...Works!
AA:146481: 36...Works!
AA:238333: 32...Works!
AA:203517: 53...Works!
AA:144050: 49...Works!
AA:147013: 47...Works!
W/A:146427: 49...Works!
W/A:202523: 42...Works!
W/A:170301: 42...Works!
W/A:170976: 72...Works!
W/A:174844: 24...Works!
W/A:240444: 60...Works!
W/A:145637: 130...Works!
W/A:170532: 22...Works!
In [20]:
len(output_table)
Out[20]:
277
In [21]:
schools_output = pd.DataFrame(output_table[1:],columns=output_table[0])
schools_output.head()
Out[21]:
Case N N1 GPAcoef ACTcoef Int Score Loss 50gpa 50act 50pred Plus.05/.5red Plus.1/1pred
0 H:148584 63 42.0 5.209164 1.028021 -33.571156 0.825397 0.295799 2.799637 18.469840 0.5 0.684487 0.824760
1 H:169248 318 291.0 7.593017 0.456457 -25.884838 0.933962 0.166390 2.335429 17.859055 0.5 0.647457 0.771316
2 H:154527 29 24.0 4.953492 0.152787 -14.555692 0.931034 0.313899 2.360950 18.723693 0.5 0.580315 0.656590
3 H:150136 36 27.0 11.980035 0.549282 -43.549191 0.888889 0.200145 2.824525 17.679957 0.5 0.705503 0.851610
4 H:197869 31 11.0 5.584194 1.039670 -40.635701 0.838710 0.276262 3.122500 22.313863 0.5 0.689770 0.831751
In [22]:
analysis_df = pd.DataFrame(analysis_list[1:],columns=analysis_list[0])
In [23]:
schools_output.to_csv(schools_output_file,index=False)
analysis_df.to_csv('anon_school_analysis_list.csv',index=False)
In [24]:
# Redo of the analysis (this was created by flagging cases
# in the file saved previously:
a_list= [
    ['H:144281','144281','H','ALL'],
    ['H:170532','170532','H','ALL'],
    ['H:Arrupe','Arrupe','H','ALL'],
    ['H:145691','145691','H','ALL'],
    ['AA:216597','216597','AA','ALL'],
    ['AA:178396','178396','AA','ALL'],
    ['AA:170532','170532','AA','ALL'],
    ['AA:Arrupe','Arrupe','AA','ALL'],
    ['AA:145691','145691','AA','ALL'],
    ['AA:170082','170082','AA','ALL'],
    ['AA:144740','144740','AA','ALL'],
]
In [25]:
output_table=[['Case','N','N1','GPAcoef','ACTcoef','Int','Score','Loss',
               '50gpa','50act', '50pred','Plus.05/.5red','Plus.1/1pred']]
for case, nces, race, hs_class in a_list[:]:
    if hs_class == 'ALL':
        this_df = df[(df.RACE == race)&(df.NCES == nces)]
    else:
        this_df = df[(df.RACE == race)&(df.NCES == nces)&(df.hs_class == hs_class)]
    
    # Now get the right array to send to the analysis function:
    trial_data = this_df[['GPA','ACT','Y']].values
        
    # Now complete the regression and append the result to the output table
    if len(trial_data) > 1:
        print('%s: %d' % (case, len(this_df)),end='')
        try:
            reg_response = run_lregression(trial_data)
            new_row = [case, len(this_df)]
            new_row.extend(reg_response)
            output_table.append(new_row)
            print('...Works!')
        except Exception as e:
            print('...No dice!')
            #raise e
H:144281: 228...Works!
H:170532: 256...Works!
H:Arrupe: 371...Works!
H:145691: 325...Works!
AA:216597: 155...Works!
AA:178396: 204...Works!
AA:170532: 161...Works!
AA:Arrupe: 172...Works!
AA:145691: 167...Works!
AA:170082: 547...Works!
AA:144740: 310...Works!
In [26]:
schools_output2 = pd.DataFrame(output_table[1:],columns=output_table[0])
schools_output2
Out[26]:
Case N N1 GPAcoef ACTcoef Int Score Loss 50gpa 50act 50pred Plus.05/.5red Plus.1/1pred
0 H:144281 228 174.0 5.514042 0.473419 -21.359345 0.903509 0.223911 2.268035 18.700768 0.5 0.625372 0.735911
1 H:170532 256 136.0 10.390712 0.371606 -44.451564 0.812500 0.393377 3.440856 23.408159 0.5 0.669370 0.803873
2 H:Arrupe 371 189.0 1.406622 0.093909 -5.325053 0.646900 0.626458 2.536979 18.704015 0.5 0.529288 0.558375
3 H:145691 325 249.0 6.913662 0.385680 -26.663059 0.870769 0.266130 2.803378 18.879492 0.5 0.631469 0.745935
4 AA:216597 155 71.0 6.062056 0.681934 -37.720571 0.838710 0.401508 3.610755 23.216287 0.5 0.655673 0.783832
5 AA:178396 204 154.0 6.682889 1.572445 -52.247808 0.946078 0.152027 3.072540 20.168823 0.5 0.754057 0.903848
6 AA:170532 161 78.0 8.119638 0.406221 -37.170041 0.863354 0.402563 3.505716 21.428974 0.5 0.647734 0.771744
7 AA:Arrupe 172 92.0 2.211974 0.041943 -5.857210 0.697674 0.577381 2.305457 18.062666 0.5 0.532845 0.565408
8 AA:145691 167 91.0 6.308057 0.384098 -23.980241 0.838323 0.317146 2.647700 18.949338 0.5 0.624209 0.733979
9 AA:170082 547 428.0 10.117137 0.703178 -41.935480 0.886654 0.247251 2.854407 18.568616 0.5 0.702127 0.847470
10 AA:144740 310 254.0 5.471804 0.583672 -26.181706 0.909677 0.204806 2.725817 19.302919 0.5 0.637707 0.755996
In [27]:
schools_output2.to_csv('anon_schools_coefs_take_2.csv',index=False)