"""Descriptive checks of survey timing relative to elections.

Two entry points are provided: ``analyze_asian_barometer`` and
``analyze_anes``.  Each reads its inputs from paths on ``Config``, saves a
bar chart of survey counts by distance to the nearest election and prints
summary tables to stdout.
"""
import datetime as dt
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from zz_NewsSocialSignaling import Config, Helper
from ResearchTools import ChartTools

# Placeholder written to ``closest_elect`` for surveys whose country has no
# election data; far outside any plotting window so those rows are ignored.
_NO_ELECTION_DATA = -100000
# Offset reported when none of a country's elections qualifies as closest
# (e.g. the country has an empty list of election dates).
_NO_CLOSE_ELECTION = 1000000
# An election is only considered when |offset| is strictly below this bound.
_MAX_ABS_OFFSET = 100000


def _closest_election_offset(survey_month, election_dates):
    """Return the signed offset to the election nearest to *survey_month*.

    Offsets are whatever ``Helper.month_diff(survey_month, election)``
    returns; the one with the smallest absolute value wins, and on ties the
    first one in *election_dates* is kept.  ``_NO_CLOSE_ELECTION`` is
    returned when no offset has an absolute value below ``_MAX_ABS_OFFSET``.
    """
    best_offset = _NO_CLOSE_ELECTION
    best_abs = _MAX_ABS_OFFSET
    for election in election_dates:
        offset = Helper.month_diff(survey_month, election)
        if np.abs(offset) < best_abs:
            best_abs = np.abs(offset)
            best_offset = offset
    return best_offset


def _plot_offset_counts(offsets, max_offset, title, xlabel, fn):
    """Save a bar chart of how many surveys fall at each election offset.

    Only offsets with ``|offset| <= max_offset`` are counted.  Every integer
    in ``[-max_offset, max_offset]`` gets a bar, with a zero count where no
    survey falls, so the x axis has no gaps.  The figure is written to *fn*
    through ``ChartTools.save_show_plot`` without being shown or pickled.
    """
    counts = offsets.loc[np.abs(offsets) <= max_offset].value_counts()
    for offset in range(-max_offset, max_offset + 1):
        if offset not in counts.index:
            counts[offset] = 0
    counts = counts.sort_index()

    fig, ax = plt.subplots()
    counts.plot.bar(ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Surveys')
    ChartTools.save_show_plot(fig=fig, fn=fn, show_graph=False, pickle_fig=False)


def analyze_asian_barometer():
    """Summarise Asian Barometer interview timing relative to elections.

    Reads the election table (Stata file at ``Config.fn_elections``) and the
    survey table (comma-separated file at ``Config.fn_asian_barometer``),
    then:

    * drops surveys without an interview date and surveys from interview
      years that have fewer than 100 observations;
    * prints every survey country that is absent from the election table;
    * stores, per survey, the offset to the nearest election of its country
      in ``closest_elect``;
    * saves a bar chart of survey counts per offset within +/-24 to
      ``Config.dir_asian_barometer_figs``;
    * prints, per matched country, its election months and the counts of
      interview dates.

    Returns nothing.
    """
    min_obs_year = 100
    election_data = pd.read_stata(Config.fn_elections)
    df = pd.read_table(Config.fn_asian_barometer, sep=',', low_memory=False)

    # filter obs w/o good interview date
    df = df.loc[df.interviewmonth.notna()].copy()
    df['interviewmonth'] = df.interviewmonth.apply(
        lambda d: dt.datetime.strptime(d, '%Y-%m-%d').date())
    df['interviewyear'] = df.interviewmonth.apply(lambda d: d.year)
    year_counts = df.interviewyear.value_counts()
    sparse_years = list(year_counts[year_counts < min_obs_year].index)
    df = df.loc[~df.interviewyear.isin(sparse_years)].copy()

    # check countries w/ matching election data
    # .str.upper() leaves missing country values as NaN instead of raising.
    df['countryraw'] = df.countryraw.str.upper()
    election_countries = set(election_data.country.unique())
    survey_countries = df.countryraw.unique()
    for country in survey_countries:
        if country not in election_countries:
            print('{0} not in election data'.format(country))

    # Election dates per survey country, kept in election-table order.
    country2elec = {
        country: [stamp.date() for stamp in
                  election_data.loc[election_data.country == country].electmonth]
        for country in survey_countries
        if country in election_countries
    }

    # Offset to the nearest election; one pass over the two columns instead
    # of per-cell .loc reads and writes.
    df['closest_elect'] = [
        _closest_election_offset(month, country2elec[country])
        if country in country2elec else _NO_ELECTION_DATA
        for country, month in zip(df.countryraw, df.interviewmonth)
    ]

    # plot survey by election counts
    _plot_offset_counts(
        df.closest_elect,
        max_offset=24,
        title='Asian Barometer Number of Surveys',
        xlabel='Months to Nearest Election',
        fn=os.path.join(Config.dir_asian_barometer_figs,
                        'survey_relative_to_elections.png'),
    )

    for country in survey_countries:
        if country not in country2elec:
            continue
        print(country)
        # sorted() leaves the lists stored in country2elec untouched.
        print([d.strftime('%Y-%m') for d in sorted(country2elec[country])])
        print(df.loc[df.countryraw == country].interviewmonth.value_counts())


def analyze_anes():
    """Summarise ANES interview timing and news consumption around elections.

    Reads the comma-separated file at ``Config.fn_anes`` and then:

    * drops rows where both ``days_pre_election`` and ``days_post_election``
      are the string ``'missing'``;
    * derives ``days2election`` (negative for pre-election interviews),
      ``pre_election`` and ``weeks2election``;
    * keeps only rows where both ``has_post`` and ``has_pre`` are true;
    * saves a bar chart of survey counts per week offset within +/-12 to
      ``Config.dir_anes_figs``;
    * prints, separately for pre- and post-election interviews, the share of
      respondents at each ``newspaper_days_last_week`` value for newspaper
      and TV news, asserting that each column of shares sums to about 1.

    Returns nothing.

    Raises:
        AssertionError: if one of the internal consistency checks fails.
    """
    df = pd.read_table(Config.fn_anes, sep=',')

    # drop missing vals
    n_before = len(df)
    has_timing = ((df.days_pre_election != 'missing')
                  | (df.days_post_election != 'missing'))
    # Explicit copy: new columns are added below, and assigning into a bare
    # .loc selection triggers pandas' SettingWithCopyWarning.
    df = df.loc[has_timing].copy()
    print('Dropping {0}/{1} obs b/c missing. {2} Remain'.format(n_before - len(df), n_before, len(df)))
    assert not np.any((df.days_pre_election == 'missing') & (df.days_post_election == 'missing'))

    # Signed day offset: post-election days are positive, pre-election days
    # are negated.
    df['days2election'] = [
        float(post) if pre == 'missing' else -float(pre)
        for pre, post in zip(df.days_pre_election, df.days_post_election)
    ]
    # NOTE(review): a pre-election value of 0 becomes -0.0, which is not < 0,
    # so such a row is flagged as post-election -- confirm this is intended.
    df['pre_election'] = df['days2election'] < 0
    # int() truncates toward zero, so week 0 covers day offsets -6 through 6.
    df['weeks2election'] = df.days2election.apply(lambda d: int(d / 7))

    n_before = len(df)
    df = df.drop(columns=['days_post_election', 'days_pre_election'])
    # Only columns are removed in this step, so the message reports 0 dropped.
    print('Dropping {0}/{1} obs b/c missing. {2} Remain'.format(n_before - len(df), n_before, len(df)))

    # restrict to surveys w/ both pre and post
    n_before = len(df)
    df = df.loc[df.has_post & df.has_pre]
    print('Dropping {0}/{1} obs b/c missing pre or post. {2} Remain'.format(n_before - len(df), n_before, len(df)))

    # plot survey by election counts
    _plot_offset_counts(
        df.weeks2election,
        max_offset=12,
        title='ANES Number of Surveys',
        xlabel='Weeks to Nearest Election',
        fn=os.path.join(Config.dir_anes_figs, 'survey_relative_to_elections.png'),
    )

    # Turn the 'missing' marker into NaN everywhere.  DataFrame.applymap is
    # deprecated in favour of DataFrame.map (pandas >= 2.1); use whichever
    # the installed pandas provides.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    df = elementwise(lambda x: np.nan if x == 'missing' else x)

    # plot news consumption relative to election
    answered = df.loc[df.newspaper_days_last_week.notna()
                      & df.tvnews_days_last_week.notna()]
    # The newspaper answers define the rows of both share columns.
    day_values = list(np.sort(answered.newspaper_days_last_week.unique()))
    periods = {
        'pre': answered.loc[answered.pre_election].copy(),
        'post': answered.loc[~answered.pre_election].copy(),
    }
    for key, period_df in periods.items():
        n = len(period_df)
        print('Number of {0} election observations: {1}'.format(key, n))
        newspaper_total = period_df.newspaper_days_last_week.value_counts().sum()
        assert newspaper_total == n, str(
            'Expected {0}, got {1}'.format(n, newspaper_total))
        assert period_df.tvnews_days_last_week.value_counts().sum() == n

        shares = pd.DataFrame(index=day_values, columns=['newspaper', 'tv news'])
        for value in shares.index:
            shares.loc[value, 'newspaper'] = (period_df.newspaper_days_last_week == value).sum() / n
            shares.loc[value, 'tv news'] = (period_df.tvnews_days_last_week == value).sum() / n
        print(shares)
        totals = shares.sum()
        assert np.all(totals > 0.99) and np.all(totals < 1.01)


if __name__ == '__main__':
    analyze_anes()