import pandas as pd import numpy as np import statsmodels.formula.api as smf from ResearchTools import ChartTools from matplotlib import pyplot as plt import os def simulate_estimator(n, public_share, share_news_only, read_te, slant_te): df = pd.DataFrame(index=range(n)) df['public'] = [int(el) for el in np.random.rand(n) < public_share] df['news_only'] = [int(el) for el in np.random.rand(n) < share_news_only] df['read_news'] = [int(el) for el in np.random.rand(n) < (0.5 + read_te * df.public)] df.loc[df.news_only == 1, 'read_news'] = np.nan # for read analysis, need to have option to read either news or sports df['news_slant'] = np.random.randn(n) + slant_te * df.public df.loc[df.news_only == 0, 'news_slant'] = np.nan # for slant analysis, need to have only news options. Could code sports as 0 slant? # hypothesis 1 mod1 = smf.ols('read_news ~ 1 + public', data=df).fit(cov_type='HC2') hyp_1_p_val = mod1.pvalues['public'] # hypothesis 2 mod2 = smf.ols('news_slant ~ 1 + public', data=df).fit(cov_type='HC2') hyp_2_p_val = mod2.pvalues['public'] return { 'h1': hyp_1_p_val, 'h2': hyp_2_p_val } def mde(n, required_power, share_news_only, sig_p, public_share): num_sims = 1000 effect_sizes = np.linspace(0, 0.2, 20) ix = 0 mdes = {} while True: effect_size = effect_sizes[ix] print(effect_size) p_vals = {'h1': [], 'h2': []} for b in range(num_sims): tmp_ps = simulate_estimator(n=n, public_share=public_share, share_news_only=share_news_only, read_te=effect_size, slant_te=effect_size) p_vals['h1'].append(tmp_ps['h1']) p_vals['h2'].append(tmp_ps['h1']) # now check if we can reject 80% of time for either hypothesis for h in p_vals: if h in mdes: continue if np.mean([int(el < sig_p) for el in p_vals[h]]) >= required_power: mdes[h] = effect_size if len(mdes) == len(p_vals): break ix += 1 return mdes def main(): required_power = 0.8 public_share = 0.5 share_news_only = 0.5 sig_p = 0.05 # simulate_estimator(n=1000, public_share=public_share, share_news_only=share_news_only, read_te=0.1, slant_te=0.1) mdes = pd.DataFrame(index=[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]) for n in mdes.index: tmp_mde = mde(n=n, required_power=required_power, share_news_only=share_news_only, sig_p=sig_p, public_share=public_share) mdes.loc[n, 'h1'] = tmp_mde['h1'] mdes.loc[n, 'h2'] = tmp_mde['h2'] fig, ax = plt.subplots() mdes.plot(ax=ax) plt.xlabel('') ChartTools.save_show_plot(fig=fig, fn=os.path.join(os.path.dirname(__file__), 'mde.pdf'), pickle_fig=False, show_graph=False) if __name__ == '__main__': main()