import sys
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# path (python console vs. terminal)
path_pre = ""
if len(sys.argv) > 1:
    if sys.argv[1] == "0":
        path_pre = "../"
        from predict_house_prices_methods import *
        print("Pre-path appended.")
else:
    from Code.predict_house_prices_methods import *

# random seed
rnd_state = 0
np.random.seed(rnd_state)

# flags
pred_plot = False
shap_plot = True
bagging = True
include_chemnitz = True
tune_fresh = False
boosting = False
diminish_muc_hh = False

if pred_plot:
    # plot style (requires the SciencePlots package)
    plt.style.use(["science"])

# get immo data
immonet_data = get_immonet_data(path_pre, include_chemnitz)

# get zip map for feature engineering: share of green voters and unemployment
zip_mapping = get_zip_mapping(path_pre)

# only keep data that falls into ZIP codes of cities
immonet_data = immonet_data[immonet_data.zip_code.isin(zip_mapping["zips"].values)]

# join ZIP-level features; unemployment is coded as "Unterdurchschnittlich" (below average) = 1,
# "Durchschnittlich" (average) = 2, "Überdurchschnittlich" (above average) = 3
immonet_data = immonet_data.join(other=get_zip_features(zip_mapping), on="zip_code")

# get model data
model_data = immonet_data.drop(columns=["url", "soup", "title", "zip_code", "object_states", "heat_type"])  # "ort",
model_data = pd.get_dummies(model_data, columns=["ort"])

# rename features
model_data.columns = rename_house_features(model_data.columns)

# convert price to price per square meter, rounded to the nearest 500 EUR
model_data["price"] = model_data["price"] / model_data["sq_meters"]
model_data["price"] = [round(p / 500, 0) * 500 for p in model_data["price"]]
model_data.drop(columns="sq_meters", inplace=True)

# remove upper outliers (top 0.5% of prices)
model_data = model_data[model_data.price < np.quantile(model_data.price, 0.995)]

# diminish München / Hamburg / Berlin: randomly drop about half of those listings
if diminish_muc_hh:
    indices_to_be_removed = model_data[(model_data.Hamburg == 1) | (model_data.Berlin == 1) |
                                       (model_data.Muenchen == 1)].index.values
    indices_to_be_removed = random.choices(indices_to_be_removed, k=int(len(indices_to_be_removed) * 0.5))
    model_data = model_data[~model_data.index.isin(indices_to_be_removed)]

# do bagging for number of rooms, construction year, floors
year_range, floor_range, room_range = range(2012, 2023, 1), [2, 3], [3, 3.5]
if bagging:
    test_subset = find_subset(model_data, year_range, floor_range, room_range)
    for data in [model_data, test_subset]:
        data.loc[:, "floor (storey)"] = [1 if f in floor_range else 0 if f < min(floor_range) else 2
                                         for f in data["floor (storey)"]]
        # data.loc[:, "construction year"] = [1 if y < 1946 else 3 if y >= np.min(year_range) else 2 for y in
        #                                     data["construction year"]]
        # data.loc[:, "nmbr of rooms"] = [1 if r < np.min(room_range) else 3 if r > np.max(room_range) else 2 for r in
        #                                 data["nmbr of rooms"]]
        data.loc[:, "construction year"] = [
            1 if y < 1942 else 2 if y < 1952 else 3 if y < 1962 else 4 if y < 1972 else 5 if y < 1982
            else 6 if y < 1992 else 7 if y < 2002 else 8 if y < 2012 else 9
            for y in data["construction year"]]
        data.loc[:, "nmbr of rooms"] = [int(r) for r in data["nmbr of rooms"]]
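# Note on the binning above: with the default ranges, "floor (storey)" is mapped to
# 0 (below the 2nd floor), 1 (2nd or 3rd floor) or 2 (above the 3rd floor), and
# "construction year" is mapped to a decade bucket 1-9, e.g. 1941 -> 1, 1955 -> 3, 2015 -> 9.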
# remove experimental indices before randomly splitting...
experiment_indices = [1353, 1273, 307]
train, test = train_test_split(model_data[~model_data.index.isin(experiment_indices)], test_size=0.05,
                               random_state=rnd_state)
# ...and append them again afterwards
test = pd.concat([test, model_data[model_data.index.isin(experiment_indices)]])

# data splits (note: this second split over the full data replaces the experiment-index split above)
train, test = train_test_split(model_data, test_size=0.05, random_state=rnd_state)
xtrain, ytrain = train.drop(columns="price"), train["price"]
xtest, ytest = test.drop(columns="price"), test["price"]

# get XGBoost/RndForest regressor
params = {}
add_str = ""
if include_chemnitz:
    add_str = add_str + "_wChem"
if bagging:
    add_str = add_str + "_bagged"

if boosting:
    pred_model = xgb.XGBRegressor(verbosity=0)
    if tune_fresh:
        search_grid = {'n_estimators': [250, 500, 1000, 2000], 'max_depth': [1, 2, 4], 'subsample': [.5, .75, 1],
                       'learning_rate': [.001, 0.01, .1], 'random_state': [rnd_state]}
        grid_search = GridSearchCV(estimator=pred_model, param_grid=search_grid, n_jobs=3)
        grid_search.fit(xtrain, ytrain)
        with open(path_pre + "Data/Param/real_estate_xgboost_param" + add_str + ".txt", "w+") as file:
            file.write(str(grid_search.best_params_))
        params = grid_search.best_params_
    else:
        # load tuned parameters from file
        with open(path_pre + "Data/Param/real_estate_xgboost_param" + add_str + ".txt", "r") as params_file:
            params = eval(params_file.read())
else:
    pred_model = RandomForestRegressor()
    if tune_fresh:
        search_grid = {'max_depth': [6, 8, 10, 12, 14], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                       'min_samples_split': [8, 10, 12], 'n_estimators': [2000, 3000, 4000],
                       'random_state': [rnd_state]}
        grid_search = GridSearchCV(estimator=pred_model, param_grid=search_grid, n_jobs=3)
        grid_search.fit(xtrain, ytrain)
        with open(path_pre + "Data/Param/real_estate_rndforest_param" + add_str + ".txt", "w+") as file:
            file.write(str(grid_search.best_params_))
        params = grid_search.best_params_
    else:
        # load tuned parameters from file
        with open(path_pre + "Data/Param/real_estate_rndforest_param" + add_str + ".txt", "r") as params_file:
            params = eval(params_file.read())

pred_model.set_params(**params)
pred_model.fit(xtrain, ytrain)
score = pred_model.score(xtrain, ytrain)
print("Training score: ", score)

# get k-fold cv average score
kfold = KFold(n_splits=5, shuffle=True)
kf_cv_scores = cross_val_score(pred_model, xtrain, ytrain, cv=kfold, scoring="r2")
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

# test scores
ypred = pred_model.predict(xtest)
mse = mean_squared_error(ytest, ypred)
r_sq = r2_score(ytest, ypred)
print("R^2: %.2f" % r_sq)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse ** (1 / 2.0)))

if pred_plot:
    # plot original vs. predicted prices on the test set
    x_ax = range(len(ytest))
    plt.plot(x_ax, ytest, label="original")
    plt.plot(x_ax, ypred, label="predicted")
    plt.title("Immonet price prediction [$R^2 =$" + str(round(kf_cv_scores.mean(), 2)) + "]")
    plt.ylabel("Price/$m^2$ [1,000 EUR]")
    plt.yticks([4000, 8000, 12000, 16000], ["4", "8", "12", "16"])
    plt.xlabel("Test samples")
    plt.legend()
    plt.tight_layout()
    plt.savefig(path_pre + "Plots/real_estate_pred" + add_str + ".pdf")
    # plt.show()
    plt.close()

for c in xtrain.columns:
    xtrain[c] = xtrain[c].astype(float)
    xtest[c] = xtest[c].astype(float)

if shap_plot:
    # plot shap values
    explainer = shap.Explainer(pred_model, xtrain)
    shap_values = explainer(xtrain, check_additivity=False)
    # shap.dependence_plot("elevator", shap_values, x)
    shap.plots.beeswarm(shap_values, show=False)
    plt.title("Immonet price prediction")
    plt.tight_layout()
    plt.savefig(path_pre + "Plots/real_estate_shap" + add_str + ".pdf")
    # plt.show()
    plt.close()

# cities for stages 1, 2, 3
city1, city2 = "Frankfurt", "Koeln"
var_prop_comb = get_12_var_comb_for_cities(city1, city2)
var_prop_comb.index = var_prop_comb.index + 1
var_prop_comb.to_csv(path_pre + "Frontend/xai_experiment/immobilien_features.csv")

# get SHAP values for stage 2
if bagging:
    # get average prices for category A cities in Germany
    cat_A_combinations = get_comb_for_cat_A_cities()
    cat_A_combinations["pred_price"] = pred_model.predict(cat_A_combinations)
    cat_A_combinations.to_csv(path_pre + "Frontend/xai_experiment/immobilien_cat_A_sim_prices.csv")

    # SHAP values
    var_prop_comb = pd.read_csv(path_pre + "Frontend/xai_experiment/immobilien_features.csv", index_col=0)
    explainer = shap.Explainer(pred_model, xtrain)
    shap_exp_val = pd.DataFrame(explainer.shap_values(var_prop_comb[xtrain.columns]))
    shap_exp_val.index = var_prop_comb.index
    shap_exp_val.columns = xtrain.columns
    shap_exp_val["Stadt"] = shap_exp_val["Koeln"] + shap_exp_val["Frankfurt"] + shap_exp_val["Stuttgart"] + \
                            shap_exp_val["Muenchen"] + shap_exp_val["Hamburg"] + shap_exp_val["Berlin"] + \
                            shap_exp_val["Duesseldorf"]
    shap_exp_val["pred_minus_base"] = shap_exp_val["garden"] + shap_exp_val["basement"] + shap_exp_val["elevator"] + \
                                      shap_exp_val["balcony"] + shap_exp_val["floor (storey)"] + \
                                      shap_exp_val["nmbr of rooms"] + shap_exp_val["construction year"] + \
                                      shap_exp_val["unemployment"] + shap_exp_val["Stadt"] + \
                                      shap_exp_val["Anteil Gruenenwaehler"]
    shap_exp_val.to_csv(path_pre + "Frontend/xai_experiment/immobilien_shap.csv")

    # predicted prices by bagged AI (stage 2)
    prices = var_prop_comb.copy(deep=True)
    pred_prices = pred_model.predict(prices[xtrain.columns])
    prices["predicted"] = pred_prices
    prices.to_csv(path_pre + "Frontend/xai_experiment/immobilien_pred_prices.csv")

    if include_chemnitz:
        # write feature combs of stage 4
        stage_4_var_combs = get_12_var_comb_for_cities("Duesseldorf", "Chemnitz", include_chemnitz=include_chemnitz)
        stage_4_var_combs["pred_price"] = pred_model.predict(stage_4_var_combs)
        if not shap_plot:
            explainer = shap.Explainer(pred_model, xtrain)
        shap_exp_val = pd.DataFrame(explainer.shap_values(stage_4_var_combs[xtrain.columns]))
        shap_exp_val.index = stage_4_var_combs.index
        shap_exp_val.columns = ["shap_" + c for c in xtrain.columns]
        shap_exp_val["shap_Stadt"] = shap_exp_val["shap_Koeln"] + shap_exp_val["shap_Frankfurt"] + \
                                     shap_exp_val["shap_Stuttgart"] + shap_exp_val["shap_Muenchen"] + \
                                     shap_exp_val["shap_Hamburg"] + shap_exp_val["shap_Berlin"] + \
                                     shap_exp_val["shap_Duesseldorf"] + shap_exp_val["shap_Chemnitz"]
        stage_4_frame = stage_4_var_combs.join(shap_exp_val)
        stage_4_frame.to_csv(path_pre + "Frontend/xai_experiment/immobilien_stage4_shap.csv")

if not bagging:
    if not include_chemnitz:
        # simulated true prices by unbagged AI WITHOUT Chemnitz
        var_prop_comb_randomized = var_prop_comb.copy(deep=True)
        n_len = len(var_prop_comb_randomized)
        var_prop_comb_randomized["floor (storey)"], var_prop_comb_randomized["construction year"] = \
            np.random.choice(floor_range, size=n_len), np.random.choice(year_range, size=n_len)
        var_prop_comb_randomized["true_sim_price"] = pred_model.predict(var_prop_comb_randomized)
        var_prop_comb_randomized.to_csv(path_pre + "Frontend/xai_experiment/immobilien_sim_true_prices.csv")
    # STAGE 4
    else:
        # write feature combs of stage 4
        stage_4_var_combs = get_12_var_comb_for_cities("Duesseldorf", "Chemnitz", include_chemnitz=include_chemnitz)
        stage_4_var_combs.to_csv(path_pre + "Frontend/xai_experiment/immobilien_stage4_features.csv")

        # simulated true prices by unbagged AI WITH Chemnitz
        n_len = len(stage_4_var_combs)
        stage_4_var_combs["floor (storey)"], stage_4_var_combs["construction year"] = \
            np.random.choice(floor_range, size=n_len), np.random.choice(year_range, size=n_len)
        stage_4_var_combs["true_sim_price"] = pred_model.predict(stage_4_var_combs)
        stage_4_var_combs.to_csv(path_pre + "Frontend/xai_experiment/immobilien_stage4_sim_true_prices.csv")

# _____________________
# _____________________
# SUBSET FOR EXPERIMENT
# _____________________

# get features and label of subset, get prediction, get overall R^2
if bagging:
    test_subset = test_subset[(test_subset[city1] == 1) | (test_subset[city2] == 1)]
    x_sub, y_sub = test_subset.drop(columns="price"), test_subset["price"]
    y_sub_pred = pred_model.predict(x_sub)
    print(f"Overall R^2 in subset: {r2_score(y_sub, y_sub_pred)}")

    # get unique rows (must be 12 unique)
    pred_diff = abs(y_sub_pred - y_sub) / y_sub_pred
    all_rows = pd.DataFrame(x_sub)
    unq_rows = all_rows.drop_duplicates(subset=None)
    print(f"We have {len(unq_rows)} unique combinations ({len(test_subset)} total) in experimental subset.")

    # find indices for all rows of each unique combination of variable features
    subset_indices = all_rows.groupby(list(all_rows)).apply(lambda x: tuple(x.index)).tolist()

    # find best performing row for each unq comb of var features
    best_pred_indices = [pred_diff[pred_diff.index.isin(i_tuple)].idxmax() for i_tuple in subset_indices]
    best_perf_rows = test_subset[test_subset.index.isin(best_pred_indices)]

    # calculate R^2 for these best-performing rows
    x_sub_best, y_sub_best = best_perf_rows.drop(columns="price"), best_perf_rows["price"]
    y_sub_best_pred = pred_model.predict(x_sub_best)
    print(f"Selected R^2 in subset: {r2_score(y_sub_best, y_sub_best_pred)}")

    # write as ground truth (for incentive) to file
    true_price_instances = test_subset[test_subset.index.isin(best_pred_indices)]
    true_price_instances.to_csv(path_pre + "Frontend/xai_experiment/immobilien_true_prices.csv", index=False)  # mode="a", # write

# # calculate R^2 for previous best-performing and now in test set rows
# x_exp, y_exp = xtest[xtest.index.isin(experiment_indices)], ytest[ytest.index.isin(experiment_indices)]
# y_exp_pred = pred_model.predict(x_exp)
# print(f"Experimental R^2: {r2_score(y_exp, y_exp_pred)}")
# # show some price stats
# print(f"The average price/sqm of the fixed set (all {len(y_sub)} properties) is {np.nanmean(y_sub)} EUR.")
# print(f"The predicted average price/sqm of the fixed set (all {len(y_sub_pred)} properties) is {np.nanmean(y_sub_pred)} EUR.")
# print(f"The average price/sqm of the experimental properties (all {len(y_exp)}) is {np.nanmean(y_exp)} EUR.")
# print(f"The predicted average price/sqm of the experimental properties (all {len(y_exp_pred)}) is {np.nanmean(y_exp_pred)} EUR.")
#
# # calculate shap values
# explainer = shap.Explainer(pred_model, x_exp)
# shap_exp_val = pd.DataFrame(explainer.shap_values(x_exp))
# shap_exp_val.index = x_exp.index
# shap_exp_val.columns = x_exp.columns
# write_to_file = False
#
# prices = pd.DataFrame()
# prices["true"] = y_exp
# prices["predicted"] = y_exp_pred
#
# if write_to_file:
#     x_exp.to_csv(path_pre + "Frontend/xai_experiment/immobilien_features.csv")
#     prices.to_csv(path_pre + "Frontend/xai_experiment/immobilien_pred_prices.csv")
#     shap_exp_val.to_csv(path_pre + "Frontend/xai_experiment/immobilien_shap.csv")
#
# np.sum(shap_exp_val.iloc[1]) + prices["predicted"].iloc[1]
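# ---------------------------------------------------------------------------
# Optional sanity check (illustrative sketch, not part of the original pipeline):
# SHAP values are additive, so the per-feature contributions of one instance plus
# the explainer's expected value should roughly reproduce the model's prediction.
# Assumes `explainer` was created in the shap_plot section above; set the flag to
# True to run the check.
# ---------------------------------------------------------------------------
run_shap_sanity_check = False
if run_shap_sanity_check and shap_plot:
    sample = xtest[xtrain.columns].iloc[[0]]          # one held-out instance
    sample_shap = explainer.shap_values(sample)       # per-feature contributions
    reconstructed = float(np.ravel(explainer.expected_value)[0]) + float(np.sum(sample_shap))
    predicted = float(pred_model.predict(sample)[0])
    print(f"SHAP reconstruction: {reconstructed:.2f} vs. model prediction: {predicted:.2f}")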