"""Ingest raw survey files (barometers, WVS, ANES) and recode their answers."""
import datetime as dt
import io
import json
import os
import re
import zipfile

import numpy as np
import pandas as pd
import requests

import zz_NewsSocialSignaling
from zz_NewsSocialSignaling import Config


def main():
    """Script entry point: run the currently enabled ingestion step."""
    project_dir = os.path.join(Config.dir_data, '1_Survey/')
    # df = ingest_afro_barometer(project_dir, questions=questions)
    ingest_anes()
    # ingest_asian_barometer(project_dir)


def decode_answer_map(am):
    """Decode a JSON-encoded answer map cell into a dict.

    Args:
        am: Raw cell value. NaN, empty, or whitespace-only cells mean
            "no answer map".

    Returns:
        ``None`` when the cell is blank; otherwise a dict whose keys are the
        JSON keys converted to ``int``, except the wildcard key ``'*'`` which
        is kept as a string.

    Raises:
        Exception: whatever ``json.loads`` / ``int`` raised for a malformed
            cell; the offending cell is printed before re-raising.
    """
    if pd.isna(am):
        return None
    # Normalise through str() so a non-string cell does not break the
    # blank check (the original already parsed ``str(am)``).
    text = str(am)
    if not text.strip():
        return None
    try:
        decoded = json.loads(text)
        return {
            (int(key) if key != '*' else key): decoded[key]
            for key in decoded
        }
    except Exception:
        # Show the offending cell, then let the original error propagate.
        print(am)
        raise


def parse_coding(name):
    """Build the per-question, per-wave coding map from a survey map CSV.

    Reads ``'BarometerSurveyMap - <name>.csv'`` from the working directory.
    Column names are lower-cased and stripped of spaces; the code then uses
    the ``varname``, ``answermap``, ``wave<N>`` and ``recode<N>`` columns.

    Args:
        name: Suffix of the CSV file name (e.g. ``'Asian'``).

    Returns:
        ``{var_name: {wave: {'id': <question id>, 'value_labels': <map>}}}``.
        Waves whose question id is missing are omitted. The highest wave uses
        the default answer map; other waves use their ``recode<N>`` map and
        fall back to the default when that cell is blank.
    """
    fn = 'BarometerSurveyMap - {0}.csv'.format(name)
    df = pd.read_csv(fn)
    df.columns = [el.lower().replace(' ', '') for el in df.columns]
    waves = [int(el.replace('wave', '')) for el in df.columns if 'wave' in el]
    # Hoisted: the newest wave is invariant across all questions.
    latest_wave = max(waves) if waves else None

    var_names = df.varname
    df.index = var_names

    question_map = {}
    for v in var_names:
        default_answers = decode_answer_map(df.loc[v, 'answermap'])
        per_wave = {}
        for wave in waves:
            qid = df.loc[v, 'wave{0}'.format(wave)]
            if pd.isna(qid):
                continue
            if wave == latest_wave:
                wave_am = default_answers
            else:
                wave_am = decode_answer_map(df.loc[v, 'recode{0}'.format(wave)])
            per_wave[wave] = {
                'id': qid,
                'value_labels': default_answers if wave_am is None else wave_am,
            }
        question_map[v] = per_wave
    return question_map


def ingest_wvs(project_dir):
    """Load the WVS longitudinal Stata file and print its first rows."""
    wvs_dir = os.path.join(project_dir, '4_WVS')
    fn = os.path.join(wvs_dir, 'WVS_Longitudinal_1981_2016_stata_v20180912.dta')
    df = pd.read_stata(fn, convert_categoricals=False)
    print(df.head())


def ingest_afro_barometer(project_dir, questions):
    """Recode every Afrobarometer archive under ``project_dir``.

    Args:
        project_dir: Survey root; archives are read from
            ``<project_dir>/1_Afrobarometer/1_Data``.
        questions: NOTE(review): currently ignored - the question map is
            always reloaded from ``question_map.json`` in the working
            directory. Kept for interface compatibility.

    Returns:
        A DataFrame with one column per question, concatenated over all
        archives.
    """
    path = os.path.join(project_dir, '1_Afrobarometer/1_Data')
    with open('question_map.json', 'r') as f:
        questions = json.load(f)
    afro_key = 'afrobarometer'
    full_df = pd.DataFrame()
    out_cols = [el for el in questions]
    for d in os.listdir(path):
        # Context manager so the archive handle is always released.
        with zipfile.ZipFile(os.path.join(path, d)) as zf:
            files = [el for el in zf.namelist()
                     if 'merged_' in el and '.dta' in el and 'MAC' not in el]
            assert len(files) == 1, ', '.join(files)
            with zf.open(files[0]) as s:
                df = pd.read_stata(s, convert_categoricals=False)
        df.columns = [el.lower() for el in df.columns]
        wave = int(re.findall('W([0-9])/', files[0])[0])
        tmp_df = pd.DataFrame(index=range(len(df)), columns=out_cols)
        for q in questions:
            # Pre-bind so the diagnostics below never raise NameError and
            # hide the real failure.
            qid = None
            answer_map = None
            try:
                qid = questions[q][afro_key][str(wave)]['id']
                answer_map = questions[q][afro_key][str(wave)]['value_labels']
                # output_map = questions[q]['output_coding']
                tmp_df[q] = [answer_map[str(el)] for el in df[qid]]
            except Exception:
                print(q)
                print(qid)
                if qid is not None and qid in df.columns:
                    print(df[qid].unique())
                if answer_map is not None:
                    print(answer_map.keys())
                raise
        full_df = pd.concat([full_df, tmp_df])
    return full_df


def convert_values(answer_map, el):
    """Translate one raw survey value using ``answer_map``.

    Args:
        answer_map: Mapping of raw value -> label, or ``None`` for
            "no recoding". A ``'*'`` key marks the map as open-ended.
        el: The raw value.

    Returns:
        ``el`` unchanged when ``answer_map`` is ``None``; ``'missing'`` when
        ``el`` is NaN; the mapped label when ``el`` is a key; ``el`` unchanged
        when the map contains ``'*'``.

    Raises:
        ValueError: ``el`` is not in the map and the map has no ``'*'`` key.
    """
    if answer_map is None:
        return el
    if pd.isna(el):
        return 'missing'
    if el in answer_map:
        return answer_map[el]
    if '*' in answer_map:
        return el
    raise ValueError(
        'value {0!r} has no label in answer map {1!r}'.format(el, answer_map))


def ingest_asian_barometer(project_dir):
    """Recode all Asian Barometer waves and write the cleaned CSV.

    Reads ``Merge.zip`` from every sub-directory of
    ``<project_dir>/2_Asianbarometer/1_Data``, recodes answers with the map
    from ``parse_coding('Asian')``, renames two countries, collapses
    ``interviewmonth`` / ``interviewyear`` into a first-of-month date, and
    writes ``2_Cleaned/cleaned_asian_survey.txt`` next to the data directory.
    """
    path = os.path.join(project_dir, '2_Asianbarometer/1_Data')
    questions = parse_coding('Asian')
    full_df = pd.DataFrame()
    out_cols = [el for el in questions]
    for d in os.listdir(path):
        fn = os.path.join(path, d, 'Merge.zip')
        # Context manager so the archive handle is always released.
        with zipfile.ZipFile(fn) as zf:
            files = [el for el in zf.namelist()
                     if 'merge' in el and '.dta' in el and 'MAC' not in el]
            assert len(files) == 1, ', '.join(files)
            with zf.open(files[0]) as s:
                df = pd.read_stata(s, convert_categoricals=False)
        df.columns = [el.lower() for el in df.columns]
        # The wave number is taken from the archive path (a single 'W<digit>').
        re_match = re.findall('W([0-9])', fn)
        assert len(re_match) == 1
        wave = int(re_match[0])
        tmp_df = pd.DataFrame(index=range(len(df)), columns=out_cols)
        for q in questions:
            if wave not in questions[q]:
                continue
            # Pre-bind so the diagnostics below never raise NameError and
            # hide the real failure.
            qid = None
            answer_map = None
            try:
                qid = questions[q][wave]['id'].lower()
                answer_map = questions[q][wave]['value_labels']
                tmp_df[q] = [convert_values(answer_map, el) for el in df[qid]]
                # output_map = questions[q]['output_coding']
            except Exception:
                print(wave)
                print(q)
                print(qid)
                if qid is not None and qid in df.columns:
                    print(df[qid].unique())
                print(answer_map)
                print(questions[q])
                raise
        full_df = pd.concat([full_df, tmp_df])

    # replace Korea w/ South Korea
    full_df.loc[full_df.countryraw == 'Korea', 'countryraw'] = 'South Korea'
    full_df.loc[full_df.countryraw == 'Myanmar', 'countryraw'] = 'Myanmar (Burma)'

    # add interview month
    full_df.index = range(len(full_df))
    new_interview_month = []
    for im, iy in zip(full_df['interviewmonth'], full_df['interviewyear']):
        if pd.isna(im):
            new_interview_month.append(np.nan)
        elif isinstance(im, dt.datetime):
            # Already a timestamp: keep its own year and month.
            new_interview_month.append(dt.date(im.year, im.month, 1))
        else:
            new_interview_month.append(dt.date(int(iy), int(im), 1))
    assert len(new_interview_month) == len(full_df)
    full_df['interviewmonth'] = new_interview_month
    full_df = full_df[[el for el in full_df.columns if el != 'interviewyear']]

    # save
    out_fn = os.path.join(os.path.dirname(path), '2_Cleaned', 'cleaned_asian_survey.txt')
    full_df.to_csv(out_fn, index=False)


def ingest_anes():
    """Recode the selected ANES variables and write them to ``Config.fn_anes``.

    Reads the single ``.dta`` member (excluding any ``stata13`` variant) of
    ``anes_timeseries_cdf_dta.zip`` under ``<Config.dir_anes>/1_Data``, maps
    each variable listed in ``question_map`` through its value labels, and
    saves the result as CSV.
    """
    d = os.path.join(Config.dir_anes, '1_Data')
    question_map = {
        'year': {'id': 'VCF0004', 'value_labels': {'*': ''}},
        'survey_respondent_number': {'id': 'VCF0006', 'value_labels': {'*': ''}},
        'idsurveystr': {'id': 'VCF0006a', 'value_labels': {'*': ''}},
        'has_pre': {'id': 'VCF0014', 'value_labels': {'0': False, '1': True}},
        'has_post': {'id': 'VCF0013', 'value_labels': {'0': False, '1': True}},
        'age': {'id': 'VCF0101',
                'value_labels': {'00': 'NA', '0': 'NA', '*': ''}},
        'gender': {'id': 'VCF0104',
                   'value_labels': {'1': 'male', '2': 'female', '3': 'other',
                                    '0': 'NA'}},
        'education': {'id': 'VCF0110',
                      'value_labels': {'1': 'grade school or less',
                                       '2': 'high school',
                                       '3': 'some college',
                                       '4': 'college or more',
                                       '0': 'NA'}},
        'polparty': {'id': 'VCF0301',
                     'value_labels': {'1': 'strong democrat',
                                      '2': 'weak democrat',
                                      '3': 'independent - democrat',
                                      '4': 'independent - ind',
                                      '5': 'independent - republican',
                                      '6': 'weak republican',
                                      '7': 'strong republican',
                                      '0': 'NA'}},
        'interest_elections': {'id': 'VCF0310',
                               'value_labels': {'1': 'not much',
                                                '2': 'somewhat',
                                                '3': 'very much',
                                                '9': 'DK', '0': 'NA'}},
        'interest_publicaffairs': {'id': 'VCF0313',
                                   'value_labels': {'1': 'hardly ever',
                                                    '2': 'now and then',
                                                    '3': 'some of the time',
                                                    '4': 'most of the time',
                                                    '9': 'DK', '0': 'NA'}},
        'turnout': {'id': 'VCF0703',
                    'value_labels': {'1': 'not registered',
                                     '2': 'register, no vote',
                                     '3': 'voted', '0': 'DK/NA'}},
        'discuss': {'id': 'VCF0731',
                    'value_labels': {'1': 'yes', '5': 'no', '8': 'DK',
                                     '9': 'NA', '6': 'not in codebook',
                                     '7': 'not in codebook'}},
        'discuss_freq': {'id': 'VCF0732',
                         'value_labels': {'1': 'daily',
                                          '2': '3-4 times a week',
                                          '3': '1-2 a week',
                                          '4': 'less often',
                                          '5': 'never', '9': 'NA/DK'}},
        'discuss_freq_last_week': {'id': 'VCF0733',
                                   'value_labels': {'0': 'none', '1': '1 day',
                                                    '2': '2 day', '3': '3 day',
                                                    '4': '4 day', '5': '5 day',
                                                    '6': '6 day',
                                                    '7': 'every day',
                                                    '9': 'DK/NA'}},
        'state_close_pres': {'id': 'VCF9029',
                             'value_labels': {'1': 'close',
                                              '3': 'win by quite a bit',
                                              '8': 'DK/Depends', '9': 'NA'}},
        'newspaper_days_last_week': {'id': 'VCF9033',
                                     'value_labels': {'0': '0 day',
                                                      '1': '1 day',
                                                      '2': '2 day',
                                                      '3': '3 day',
                                                      '4': '4 day',
                                                      '5': '5 day',
                                                      '6': '6 day',
                                                      '7': 'every day',
                                                      '8': 'DK', '9': 'NA'}},
        'newspapers_binary': {'id': 'VCF9034',
                              'value_labels': {'1': 'yes', '5': 'no',
                                               '8': 'DK', '9': 'NA'}},
        'tvnews_days_last_week': {'id': 'VCF9035',
                                  'value_labels': {'0': '0 day', '1': '1 day',
                                                   '2': '2 day', '3': '3 day',
                                                   '4': '4 day', '5': '5 day',
                                                   '6': '6 day',
                                                   '7': 'every day',
                                                   '8': 'DK', '9': 'NA'}},
        'senate_race': {'id': 'VCF9054',
                        'value_labels': {'1': 'yes', '2': 'no',
                                         '9': 'NA (DC)'}},
        'senate_race_type': {'id': 'VCF9055', 'value_labels': {'*': ''}},
        'days_pre_election': {'id': 'VCF1015',
                              'value_labels': {'*': '', '99': 'NA'}},
        'days_post_election': {'id': 'VCF1016',
                               'value_labels': {'*': '', '99': 'NA'}}
        # {'12': 'D incumb - R chall', '13': 'D incum - Other chall', '14': 'D incum - no chall', '19': 'D incum - R + O challs',
        #  '21': 'R incumb - D chall', '23': 'R incum - Other chall', '24': 'R incum - no chall', '29': 'R incum - D + O challs',
        #  '51': 'D - no chall', '52': 'R - no chall', '55': 'D + R', '59': 'D + R + O',
        #  '61': 'D - no chall', '62': 'R - no chall', '65': 'D + R', '69': 'D + R + O', '75': 'D + R',
        #  '81': '2 D incumbs (no race)', '82': ''}}
    }
    # NOTE THERE IS SOME VALIDATION OF VOTING & REGISTRATION DATA!!!

    # Every variable additionally accepts the literal 'INAP' code.
    for el in question_map:
        question_map[el]['value_labels']['INAP'] = 'Not Used'
    question_converters = {
        el: SurveyQuestion(var_name=spec['id'], value_labels=spec['value_labels'])
        for el, spec in question_map.items()
    }

    fn = os.path.join(d, 'anes_timeseries_cdf_dta.zip')
    # Context manager so the archive handle is always released.
    with zipfile.ZipFile(fn) as zf:
        files = [el for el in zf.namelist()
                 if '.dta' in el and 'stata13' not in el]
        assert len(files) == 1, ', '.join(files)
        with zf.open(files[0]) as s:
            raw_df = pd.read_stata(s, convert_categoricals=False)

    # SurveyQuestion upper-cases its variable name, so match that here.
    raw_df.columns = [el.upper() for el in raw_df.columns]
    raw_df.index = range(len(raw_df))
    df = pd.DataFrame(index=range(len(raw_df)),
                      columns=list(question_converters.keys()))
    assert np.all(df.index == raw_df.index)
    for el, converter in question_converters.items():
        print(el)
        df[el] = raw_df[converter.var_name].apply(converter.convert_values)
    df.to_csv(Config.fn_anes, index=False)
    print(df.columns)


class SurveyQuestion:
    """One survey variable together with the map used to label its values."""

    def __init__(self, var_name, value_labels):
        """Store the upper-cased variable name and an int-augmented label map.

        Args:
            var_name: Variable name in the raw data; stored upper-cased.
            value_labels: Mapping raw value -> label, or ``None`` for
                "no recoding". The caller's dict is copied, not modified.
                Every key that ``int()`` accepts is additionally registered
                under its integer form so numeric raw values match
                string-keyed codebook entries.
        """
        self.var_name = var_name.upper()
        if value_labels is None:
            # convert_values treats None as "pass values through".
            self.value_labels = None
            return
        labels = dict(value_labels)
        for key in list(labels.keys()):
            try:
                labels[int(key)] = labels[key]
            except (ValueError, TypeError):
                # Non-numeric keys such as '*' or 'INAP' have no int alias.
                continue
        self.value_labels = labels

    def convert_values(self, el):
        """Label one raw value; see the module-level ``convert_values``."""
        return convert_values(self.value_labels, el)


if __name__ == '__main__':
    main()