import datetime as dt
import io
import pandas as pd
import json
import zz_NewsSocialSignaling
from zz_NewsSocialSignaling import Config
import numpy as np
import os
import zipfile
import re
import requests


def main():
    """Script entry point: run the ANES ingestion.

    The barometer ingestions are kept as commented-out calls; they take the
    survey root directory computed below.
    """
    survey_root = os.path.join(Config.dir_data, '1_Survey/')

    # Alternative ingestions, enable as needed:
    # df = ingest_afro_barometer(survey_root, questions=questions)
    # ingest_asian_barometer(survey_root)
    ingest_anes()


def decode_answer_map(am):
    """Decode one JSON-encoded answer map cell.

    :param am: cell content; a JSON object string, or NaN / empty /
        whitespace-only when no map is given.
    :return: ``None`` when the cell carries no map, otherwise a dict whose
        keys are converted to ``int``, except the wildcard key ``'*'`` which
        is kept as a string.
    :raises Exception: whatever JSON parsing or key conversion raised; the
        offending cell is printed first.
    """
    # Missing, empty and blank cells all mean "no answer map".
    if pd.isna(am) or len(am) == 0 or am.isspace():
        return None

    try:
        parsed = json.loads(str(am))
        decoded = {}
        for key in parsed:
            decoded[key if key == '*' else int(key)] = parsed[key]
    except Exception as e:
        print(am)
        raise e
    return decoded


def parse_coding(name):
    """Build the per-question, per-wave coding map from a survey-map CSV.

    Reads ``BarometerSurveyMap - <name>.csv`` relative to the current working
    directory. Column names are lower-cased and stripped of spaces; every
    column containing ``wave`` is treated as a wave column whose numeric
    suffix is the wave number.

    :param name: survey name inserted into the CSV file name.
    :return: dict mapping each ``varname`` to ``{wave: {'id': ...,
        'value_labels': ...}}``. Waves with a missing question id are
        omitted. The highest wave uses the ``answermap`` column; other waves
        use their ``recode<wave>`` column and fall back to ``answermap`` when
        that cell is empty.
    """
    coding = pd.read_csv('BarometerSurveyMap - {0}.csv'.format(name))
    coding.columns = [col.lower().replace(' ', '') for col in coding.columns]
    waves = [int(col.replace('wave', '')) for col in coding.columns if 'wave' in col]
    # Loop-invariant; only evaluated when there is at least one wave column.
    last_wave = np.max(waves) if waves else None

    names = coding.varname
    coding.index = names

    question_map = {}
    for var in names:
        fallback = decode_answer_map(coding.loc[var, 'answermap'])
        per_wave = {}
        for wave in waves:
            qid = coding.loc[var, 'wave{0}'.format(wave)]
            if pd.isna(qid):
                continue
            if wave == last_wave:
                labels = fallback
            else:
                labels = decode_answer_map(coding.loc[var, 'recode{0}'.format(wave)])
                if labels is None:
                    labels = fallback
            per_wave[wave] = {'id': qid, 'value_labels': labels}
        question_map[var] = per_wave
    return question_map


def ingest_wvs(project_dir):
    """Load the WVS longitudinal Stata file and print its first rows.

    :param project_dir: directory containing the ``4_WVS`` folder.
    """
    stata_path = os.path.join(
        project_dir, '4_WVS', 'WVS_Longitudinal_1981_2016_stata_v20180912.dta')
    # Keep the raw numeric codes instead of pandas categoricals.
    wvs = pd.read_stata(stata_path, convert_categoricals=False)
    print(wvs.head())


def ingest_afro_barometer(project_dir, questions):
    """Recode every Afrobarometer archive into one stacked DataFrame.

    Each entry of ``<project_dir>/1_Afrobarometer/1_Data`` is opened as a zip
    archive that must contain exactly one merged ``.dta`` file; the wave
    number is taken from the ``W<digit>/`` part of that member's name.

    :param project_dir: directory containing the ``1_Afrobarometer`` folder.
    :param questions: currently ignored, see the note below.
    :return: DataFrame with one column per question and the rows of all
        archives stacked (each archive keeps its own 0..n-1 index); an empty
        DataFrame when the data directory has no entries.
    :raises Exception: any lookup/recoding error is re-raised unchanged after
        diagnostics are printed.
    """
    path = os.path.join(project_dir, '1_Afrobarometer/1_Data')

    # NOTE(review): the ``questions`` argument is overwritten here, so the
    # map is always read from question_map.json in the working directory.
    # Kept as-is because callers may rely on it - confirm the intent.
    with open('question_map.json', 'r') as f:
        questions = json.load(f)

    afro_key = 'afrobarometer'
    out_cols = list(questions)
    frames = []

    for d in os.listdir(path):
        # Context managers make sure the archive handle is released.
        with zipfile.ZipFile(os.path.join(path, d)) as zf:
            files = [el for el in zf.namelist() if 'merged_' in el and '.dta' in el and 'MAC' not in el]
            assert len(files) == 1, ', '.join(files)
            with zf.open(files[0]) as s:
                df = pd.read_stata(s, convert_categoricals=False)
        df.columns = [el.lower() for el in df.columns]
        wave = int(re.findall(r'W([0-9])/', files[0])[0])
        tmp_df = pd.DataFrame(index=range(len(df)), columns=out_cols)
        for q in questions:
            # Reset per question so the diagnostics below never show values
            # left over from a previous question (or fail on unbound names).
            qid = None
            answer_map = None
            try:
                wave_spec = questions[q][afro_key][str(wave)]
                qid = wave_spec['id']
                answer_map = wave_spec['value_labels']
                # output_map = questions[q]['output_coding']
                tmp_df[q] = [answer_map[str(el)] for el in df[qid]]
            except Exception:
                print(q)
                print(qid)
                # Only print what is actually available, so the original
                # error is the one that propagates.
                if isinstance(qid, str) and qid in df.columns:
                    print(df[qid].unique())
                if isinstance(answer_map, dict):
                    print(answer_map.keys())
                raise
        frames.append(tmp_df)

    # Single concat instead of growing the frame inside the loop.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def convert_values(answer_map, el):
    """Translate one raw answer through an answer map.

    :param answer_map: dict of raw value -> label, or ``None`` for "no
        recoding". A ``'*'`` key marks the map as open: unlisted values pass
        through unchanged.
    :param el: the raw answer value.
    :return: ``el`` itself when there is no map, ``'missing'`` for NaN
        values, the mapped label when ``el`` is listed, or ``el`` unchanged
        when the map contains ``'*'``.
    :raises Exception: when ``el`` is neither listed nor covered by ``'*'``;
        the map and the value are printed first.
    """
    if answer_map is None:
        return el
    if pd.isna(el):
        return 'missing'
    if el not in answer_map:
        if '*' not in answer_map:
            print(answer_map)
            print(el)
            raise Exception
        # Wildcard map: unknown values are passed through as-is.
        return el
    return answer_map[el]


def _interview_month_start(im, iy):
    """Return the first day of the interview month, or NaN if the month is missing."""
    if pd.isna(im):
        return np.nan
    if isinstance(im, dt.datetime):
        # The month cell already holds a full timestamp; the year cell is not needed.
        return dt.date(im.year, im.month, 1)
    return dt.date(int(iy), int(im), 1)


def ingest_asian_barometer(project_dir):
    """Recode all Asian Barometer waves and write one cleaned CSV.

    For every sub-directory of ``<project_dir>/2_Asianbarometer/1_Data`` the
    single merged ``.dta`` member of its ``Merge.zip`` is read, and each
    question from ``parse_coding('Asian')`` that is defined for that wave is
    recoded with ``convert_values``. Afterwards two country names are
    normalised, ``interviewmonth`` is replaced by the first day of the
    interview month, ``interviewyear`` is dropped, and the result is written
    to ``<project_dir>/2_Asianbarometer/2_Cleaned/cleaned_asian_survey.txt``.

    :param project_dir: directory containing the ``2_Asianbarometer`` folder.
    :raises Exception: any lookup/recoding error is re-raised unchanged after
        diagnostics are printed.
    """
    path = os.path.join(project_dir, '2_Asianbarometer/1_Data')
    questions = parse_coding('Asian')

    out_cols = list(questions)
    frames = []

    for d in os.listdir(path):
        fn = os.path.join(path, d, 'Merge.zip')
        # Context managers make sure the archive handle is released.
        with zipfile.ZipFile(fn) as zf:
            files = [el for el in zf.namelist() if 'merge' in el and '.dta' in el and 'MAC' not in el]
            assert len(files) == 1, ', '.join(files)
            with zf.open(files[0]) as s:
                df = pd.read_stata(s, convert_categoricals=False)
        df.columns = [el.lower() for el in df.columns]
        # Look for the wave marker in the wave directory name only, so a
        # "W<digit>" elsewhere in the absolute path cannot break the check.
        re_match = re.findall(r'W([0-9])', d)
        assert len(re_match) == 1
        wave = int(re_match[0])
        tmp_df = pd.DataFrame(index=range(len(df)), columns=out_cols)
        for q in questions:
            if wave not in questions[q]:
                continue
            # Reset per question so the diagnostics below never show values
            # left over from a previous question (or fail on unbound names).
            qid = None
            answer_map = None
            try:
                qid = questions[q][wave]['id'].lower()
                answer_map = questions[q][wave]['value_labels']
                tmp_df[q] = [convert_values(answer_map, el) for el in df[qid]]
                # output_map = questions[q]['output_coding']
            except Exception:
                print(wave)
                print(q)
                print(qid)
                # Guarded so a missing column does not mask the original error.
                if isinstance(qid, str) and qid in df.columns:
                    print(df[qid].unique())
                print(answer_map)
                print(questions[q])
                raise
        frames.append(tmp_df)

    # Single concat with a fresh 0..n-1 index instead of growing the frame
    # inside the loop and re-indexing afterwards.
    full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # replace Korea w/ South Korea
    full_df.loc[full_df.countryraw == 'Korea', 'countryraw'] = 'South Korea'
    full_df.loc[full_df.countryraw == 'Myanmar', 'countryraw'] = 'Myanmar (Burma)'

    # add interview month
    full_df['interviewmonth'] = [
        _interview_month_start(im, iy)
        for im, iy in zip(full_df['interviewmonth'], full_df['interviewyear'])
    ]
    full_df = full_df[[el for el in full_df.columns if el != 'interviewyear']]

    # save
    out_fn = os.path.join(os.path.dirname(path), '2_Cleaned', 'cleaned_asian_survey.txt')
    full_df.to_csv(out_fn, index=False)


def ingest_anes():
    """Recode the ANES cumulative data file and write it out as CSV.

    Reads the single ``.dta`` member (excluding ``stata13`` variants) of
    ``<Config.dir_anes>/1_Data/anes_timeseries_cdf_dta.zip``, recodes the
    variables listed in ``question_map`` through ``SurveyQuestion`` and
    writes the result to ``Config.fn_anes``. Progress (each output column
    name, then the final column index) is printed.
    """
    d = os.path.join(Config.dir_anes, '1_Data')

    # Output column -> source variable id and raw value -> label map.
    # A '*' key lets values that are not listed pass through unchanged.
    question_map = {
        'year': {'id': 'VCF0004', 'value_labels': {'*': ''}},
        'survey_respondent_number': {'id': 'VCF0006', 'value_labels': {'*': ''}},
        'idsurveystr': {'id': 'VCF0006a', 'value_labels': {'*': ''}},
        'has_pre': {'id': 'VCF0014', 'value_labels': {'0': False, '1': True}},
        'has_post': {'id': 'VCF0013', 'value_labels': {'0': False, '1': True}},
        'age': {'id': 'VCF0101', 'value_labels': {'00': 'NA', '0': 'NA', '*': ''}},
        'gender': {'id': 'VCF0104', 'value_labels': {'1': 'male', '2': 'female', '3': 'other', '0': 'NA'}},
        'education': {'id': 'VCF0110', 'value_labels': {'1': 'grade school or less', '2': 'high school', '3': 'some college', '4': 'college or more', '0': 'NA'}},
        'polparty': {'id': 'VCF0301', 'value_labels': {'1': 'strong democrat', '2': 'weak democrat', '3': 'independent - democrat',
                                                       '4': 'independent - ind', '5': 'independent - republican',
                                                       '6': 'weak republican', '7': 'strong republican', '0': 'NA'}},
        'interest_elections': {'id': 'VCF0310', 'value_labels': {'1': 'not much', '2': 'somewhat', '3': 'very much', '9': 'DK', '0': 'NA'}},
        'interest_publicaffairs': {'id': 'VCF0313', 'value_labels': {'1': 'hardly ever', '2': 'now and then', '3': 'some of the time',
                                                                     '4': 'most of the time', '9': 'DK', '0': 'NA'}},
        'turnout': {'id': 'VCF0703', 'value_labels': {'1': 'not registered', '2': 'register, no vote', '3': 'voted', '0': 'DK/NA'}},
        'discuss': {'id': 'VCF0731', 'value_labels': {'1': 'yes', '5': 'no', '8': 'DK', '9': 'NA', '6': 'not in codebook', '7': 'not in codebook'}},
        'discuss_freq': {'id': 'VCF0732', 'value_labels': {'1': 'daily', '2': '3-4 times a week', '3': '1-2 a week', '4': 'less often', '5': 'never', '9': 'NA/DK'}},
        'discuss_freq_last_week': {'id': 'VCF0733', 'value_labels': {'0': 'none', '1': '1 day', '2': '2 day', '3': '3 day', '4': '4 day', '5': '5 day', '6': '6 day', '7': 'every day', '9': 'DK/NA'}},
        'state_close_pres': {'id': 'VCF9029', 'value_labels': {'1': 'close', '3': 'win by quite a bit', '8': 'DK/Depends', '9': 'NA'}},
        'newspaper_days_last_week': {'id': 'VCF9033', 'value_labels': {'0': '0 day', '1': '1 day', '2': '2 day', '3': '3 day', '4': '4 day', '5': '5 day', '6': '6 day', '7': 'every day', '8': 'DK', '9': 'NA'}},
        'newspapers_binary': {'id': 'VCF9034', 'value_labels': {'1': 'yes', '5': 'no', '8': 'DK', '9': 'NA'}},
        'tvnews_days_last_week': {'id': 'VCF9035', 'value_labels': {'0': '0 day', '1': '1 day', '2': '2 day', '3': '3 day', '4': '4 day', '5': '5 day', '6': '6 day', '7': 'every day', '8': 'DK', '9': 'NA'}},
        'senate_race': {'id': 'VCF9054', 'value_labels': {'1': 'yes', '2': 'no', '9': 'NA (DC)'}},
        'senate_race_type': {'id': 'VCF9055', 'value_labels': {'*': ''}},
        'days_pre_election': {'id': 'VCF1015', 'value_labels': {'*': '', '99': 'NA'}},
        'days_post_election': {'id': 'VCF1016', 'value_labels': {'*': '', '99': 'NA'}}
        # Unused label map kept for reference (presumably the detailed
        # senate_race_type codes - TODO confirm):
        # {'12': 'D incumb - R chall', '13': 'D incum - Other chall', '14': 'D incum - no chall', '19': 'D incum - R + O challs',
            #                                                    '21': 'R incumb - D chall', '23': 'R incum - Other chall', '24': 'R incum - no chall', '29': 'R incum - D + O challs',
            #                                                    '51': 'D - no chall', '52': 'R - no chall', '55': 'D + R', '59': 'D + R + O',
            #                                                    '61': 'D - no chall', '62': 'R - no chall', '65': 'D + R', '69': 'D + R + O', '75': 'D + R',
            #                                                    '81': '2 D incumbs (no race)', '82': ''}}
    }
    # NOTE THERE IS SOME VALIDATION OF VOTING & REGISTRATION DATA!!!
    # Every variable additionally maps the literal 'INAP' to 'Not Used'.
    for el in question_map:
        question_map[el]['value_labels']['INAP'] = 'Not Used'
    question_converters = {
        el: SurveyQuestion(var_name=spec['id'], value_labels=spec['value_labels'])
        for el, spec in question_map.items()
    }

    fn = os.path.join(d, 'anes_timeseries_cdf_dta.zip')
    # Context managers make sure the archive handle is released.
    with zipfile.ZipFile(fn) as zf:
        files = [el for el in zf.namelist() if '.dta' in el and 'stata13' not in el]
        assert len(files) == 1, ', '.join(files)
        with zf.open(files[0]) as s:
            raw_df = pd.read_stata(s, convert_categoricals=False)
    # SurveyQuestion upper-cases its variable name, so match that here.
    raw_df.columns = [el.upper() for el in raw_df.columns]
    raw_df.index = range(len(raw_df))
    # Sharing raw_df's index keeps the recoded columns row-aligned.
    df = pd.DataFrame(index=raw_df.index, columns=list(question_converters))
    for el, converter in question_converters.items():
        print(el)
        df[el] = raw_df[converter.var_name].apply(converter.convert_values)
    df.to_csv(Config.fn_anes, index=False)
    print(df.columns)


class SurveyQuestion:
    """Recode the raw answers of one survey variable via a value-label map.

    Attributes:
        var_name: the variable name, upper-cased.
        value_labels: private copy of the label map (or ``None``). Every key
            that ``int()`` accepts is also registered under its integer
            form, so numeric raw values match string keys such as ``'1'``.
    """

    def __init__(self, var_name, value_labels):
        """
        :param var_name: name of the source variable; stored upper-cased.
        :param value_labels: dict of raw value -> label, or ``None`` for "no
            recoding". A ``'*'`` key lets unlisted values pass through.
        """
        self.var_name = var_name.upper()
        if value_labels is None:
            # convert_values() treats a missing map as "return the value as-is".
            self.value_labels = None
            return
        # Copy so that adding the integer aliases below does not modify the
        # dict owned by the caller.
        self.value_labels = dict(value_labels)
        for el in list(self.value_labels.keys()):
            try:
                self.value_labels[int(el)] = self.value_labels[el]
            except (TypeError, ValueError, OverflowError):
                # Non-numeric keys (e.g. '*') simply get no integer alias.
                continue

    def convert_values(self, el):
        """Translate one raw answer into its label.

        :param el: the raw answer value.
        :return: ``el`` itself when there is no map, ``'missing'`` for NaN
            values, the mapped label when ``el`` is listed, or ``el``
            unchanged when the map contains ``'*'``.
        :raises Exception: when ``el`` is neither listed nor covered by
            ``'*'``.
        """
        answer_map = self.value_labels
        if answer_map is None:
            return el
        if pd.isna(el):
            return 'missing'
        if el in answer_map:
            return answer_map[el]
        if '*' in answer_map:
            return el
        raise Exception(
            'value {0!r} of {1} is not covered by answer map {2!r}'.format(
                el, self.var_name, answer_map))


# Run the ingestion only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()