import sys
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# path (python console vs. terminal)
path_pre = ""
if len(sys.argv) > 1:
    if sys.argv[1] == "0":
        path_pre = "../"
        from predict_house_prices_methods import *
        print("Pre-path appended.")
else:
    from Code.predict_house_prices_methods import *

# random seed
rnd_state = 0
np.random.seed(rnd_state)

# flags
pred_plot = False
shap_plot = True
bagging = True
include_chemnitz = True
tune_fresh = False
boosting = False
diminish_muc_hh = False

if pred_plot:
    # plot style (requires the SciencePlots package)
    plt.style.use(["science"])

# get immo data
immonet_data = get_immonet_data(path_pre, include_chemnitz)

# get zip map for feature engineering: share of green voters and unemployment
zip_mapping = get_zip_mapping(path_pre)

# only keep data that falls into ZIP codes of cities
immonet_data = immonet_data[immonet_data.zip_code.isin(zip_mapping["zips"].values)]

# join ZIP-level features; unemployment is coded as "Unterdurchschnittlich" (below average) = 1,
# "Durchschnittlich" (average) = 2, "Überdurchschnittlich" (above average) = 3
immonet_data = immonet_data.join(other=get_zip_features(zip_mapping), on="zip_code")

# get model data
model_data = immonet_data.drop(columns=["url", "soup", "title", "zip_code", "object_states", "heat_type"])  # "ort",
model_data = pd.get_dummies(model_data, columns=["ort"])

# rename features
model_data.columns = rename_house_features(model_data.columns)

# convert price to price per square meter, rounded to the nearest 500 EUR
model_data["price"] = model_data["price"] / model_data["sq_meters"]
model_data["price"] = [round(p / 500, 0) * 500 for p in model_data["price"]]
model_data.drop(columns="sq_meters", inplace=True)

# remove upper outliers (top 0.5% of prices)
model_data = model_data[model_data.price < np.quantile(model_data.price, 0.995)]

# diminish München / Hamburg / Berlin: randomly drop about half of those listings
if diminish_muc_hh:
    indices_to_be_removed = model_data[(model_data.Hamburg == 1) | (model_data.Berlin == 1) |
                                       (model_data.Muenchen == 1)].index.values
    indices_to_be_removed = random.choices(indices_to_be_removed, k=int(len(indices_to_be_removed) * 0.5))
    model_data = model_data[~model_data.index.isin(indices_to_be_removed)]

# do bagging for number of rooms, construction year, floors
year_range, floor_range, room_range = range(2012, 2023, 1), [2, 3], [3, 3.5]
if bagging:
    test_subset = find_subset(model_data, year_range, floor_range, room_range)
    for data in [model_data, test_subset]:
        data.loc[:, "floor (storey)"] = [1 if f in floor_range else 0 if f < min(floor_range) else 2
                                         for f in data["floor (storey)"]]
        # data.loc[:, "construction year"] = [1 if y < 1946 else 3 if y >= np.min(year_range) else 2 for y in
        #                                     data["construction year"]]
        # data.loc[:, "nmbr of rooms"] = [1 if r < np.min(room_range) else 3 if r > np.max(room_range) else 2 for r in
        #                                 data["nmbr of rooms"]]
        data.loc[:, "construction year"] = [
            1 if y < 1942 else 2 if y < 1952 else 3 if y < 1962 else 4 if y < 1972 else 5 if y < 1982
            else 6 if y < 1992 else 7 if y < 2002 else 8 if y < 2012 else 9
            for y in data["construction year"]]
        data.loc[:, "nmbr of rooms"] = [int(r) for r in data["nmbr of rooms"]]
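# Note on the binning above: with the default ranges, "floor (storey)" is mapped to
# 0 (below the 2nd floor), 1 (2nd or 3rd floor) or 2 (above the 3rd floor), and
# "construction year" is mapped to a decade bucket 1-9, e.g. 1941 -> 1, 1955 -> 3, 2015 -> 9.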
# remove experimental indices before randomly splitting...
experiment_indices = [1353, 1273, 307]
train, test = train_test_split(model_data[~model_data.index.isin(experiment_indices)], test_size=0.05,
                               random_state=rnd_state)
# ...and append them again afterwards
test = pd.concat([test, model_data[model_data.index.isin(experiment_indices)]])

# data splits (note: this second split over the full data replaces the experiment-index split above)
train, test = train_test_split(model_data, test_size=0.05, random_state=rnd_state)
xtrain, ytrain = train.drop(columns="price"), train["price"]
xtest, ytest = test.drop(columns="price"), test["price"]

# get XGBoost/RndForest regressor
params = {}
add_str = ""
if include_chemnitz:
    add_str = add_str + "_wChem"
if bagging:
    add_str = add_str + "_bagged"

if boosting:
    pred_model = xgb.XGBRegressor(verbosity=0)
    if tune_fresh:
        search_grid = {'n_estimators': [250, 500, 1000, 2000], 'max_depth': [1, 2, 4], 'subsample': [.5, .75, 1],
                       'learning_rate': [.001, 0.01, .1], 'random_state': [rnd_state]}
        grid_search = GridSearchCV(estimator=pred_model, param_grid=search_grid, n_jobs=3)
        grid_search.fit(xtrain, ytrain)
        with open(path_pre + "Data/Param/real_estate_xgboost_param" + add_str + ".txt", "w+") as file:
            file.write(str(grid_search.best_params_))
        params = grid_search.best_params_
    else:
        # load tuned parameters from file
        with open(path_pre + "Data/Param/real_estate_xgboost_param" + add_str + ".txt", "r") as params_file:
            params = eval(params_file.read())
else:
    pred_model = RandomForestRegressor()
    if tune_fresh:
        search_grid = {'max_depth': [6, 8, 10, 12, 14], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                       'min_samples_split': [8, 10, 12], 'n_estimators': [2000, 3000, 4000],
                       'random_state': [rnd_state]}
        grid_search = GridSearchCV(estimator=pred_model, param_grid=search_grid, n_jobs=3)
        grid_search.fit(xtrain, ytrain)
        with open(path_pre + "Data/Param/real_estate_rndforest_param" + add_str + ".txt", "w+") as file:
            file.write(str(grid_search.best_params_))
        params = grid_search.best_params_
    else:
        # load tuned parameters from file
        with open(path_pre + "Data/Param/real_estate_rndforest_param" + add_str + ".txt", "r") as params_file:
            params = eval(params_file.read())

pred_model.set_params(**params)
pred_model.fit(xtrain, ytrain)
score = pred_model.score(xtrain, ytrain)
print("Training score: ", score)

# get k-fold cv average score
kfold = KFold(n_splits=5, shuffle=True)
kf_cv_scores = cross_val_score(pred_model, xtrain, ytrain, cv=kfold, scoring="r2")
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

# test scores
ypred = pred_model.predict(xtest)
mse = mean_squared_error(ytest, ypred)
r_sq = r2_score(ytest, ypred)
print("R^2: %.2f" % r_sq)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse ** (1 / 2.0)))

if pred_plot:
    # plot original vs. predicted prices on the test set
    x_ax = range(len(ytest))
    plt.plot(x_ax, ytest, label="original")
    plt.plot(x_ax, ypred, label="predicted")
    plt.title("Immonet price prediction [$R^2 =$" + str(round(kf_cv_scores.mean(), 2)) + "]")
    plt.ylabel("Price/$m^2$ [1,000 EUR]")
    plt.yticks([4000, 8000, 12000, 16000], ["4", "8", "12", "16"])
    plt.xlabel("Test samples")
    plt.legend()
    plt.tight_layout()
    plt.savefig(path_pre + "Plots/real_estate_pred" + add_str + ".pdf")
    # plt.show()
    plt.close()

for c in xtrain.columns:
    xtrain[c] = xtrain[c].astype(float)
    xtest[c] = xtest[c].astype(float)

if shap_plot:
    # plot shap values
    explainer = shap.Explainer(pred_model, xtrain)
    shap_values = explainer(xtrain, check_additivity=False)
    # shap.dependence_plot("elevator", shap_values, x)
    shap.plots.beeswarm(shap_values, show=False)
    plt.title("Immonet price prediction")
    plt.tight_layout()
    plt.savefig(path_pre + "Plots/real_estate_shap" + add_str + ".pdf")
    # plt.show()
    plt.close()

# cities for stages 1, 2, 3
city1, city2 = "Frankfurt", "Koeln"
var_prop_comb = get_12_var_comb_for_cities(city1, city2)
var_prop_comb.index = var_prop_comb.index + 1
var_prop_comb.to_csv(path_pre + "Frontend/xai_experiment/immobilien_features.csv")

# get SHAP values for stage 2
if bagging:
    # get average prices for category A cities in Germany
    cat_A_combinations = get_comb_for_cat_A_cities()
    cat_A_combinations["pred_price"] = pred_model.predict(cat_A_combinations)
    cat_A_combinations.to_csv(path_pre + "Frontend/xai_experiment/immobilien_cat_A_sim_prices.csv")

    # SHAP values
    var_prop_comb = pd.read_csv(path_pre + "Frontend/xai_experiment/immobilien_features.csv", index_col=0)
    explainer = shap.Explainer(pred_model, xtrain)
    shap_exp_val = pd.DataFrame(explainer.shap_values(var_prop_comb[xtrain.columns]))
    shap_exp_val.index = var_prop_comb.index
    shap_exp_val.columns = xtrain.columns
    shap_exp_val["Stadt"] = shap_exp_val["Koeln"] + shap_exp_val["Frankfurt"] + shap_exp_val["Stuttgart"] + \
                            shap_exp_val["Muenchen"] + shap_exp_val["Hamburg"] + shap_exp_val["Berlin"] + \
                            shap_exp_val["Duesseldorf"]
    shap_exp_val["pred_minus_base"] = shap_exp_val["garden"] + shap_exp_val["basement"] + shap_exp_val["elevator"] + \
                                      shap_exp_val["balcony"] + shap_exp_val["floor (storey)"] + \
                                      shap_exp_val["nmbr of rooms"] + shap_exp_val["construction year"] + \
                                      shap_exp_val["unemployment"] + shap_exp_val["Stadt"] + \
                                      shap_exp_val["Anteil Gruenenwaehler"]
    shap_exp_val.to_csv(path_pre + "Frontend/xai_experiment/immobilien_shap.csv")

    # predicted prices by bagged AI (stage 2)
    prices = var_prop_comb.copy(deep=True)
    pred_prices = pred_model.predict(prices[xtrain.columns])
    prices["predicted"] = pred_prices
    prices.to_csv(path_pre + "Frontend/xai_experiment/immobilien_pred_prices.csv")

    if include_chemnitz:
        # write feature combs of stage 4
        stage_4_var_combs = get_12_var_comb_for_cities("Duesseldorf", "Chemnitz", include_chemnitz=include_chemnitz)
        stage_4_var_combs["pred_price"] = pred_model.predict(stage_4_var_combs)
        if not shap_plot:
            explainer = shap.Explainer(pred_model, xtrain)
        shap_exp_val = pd.DataFrame(explainer.shap_values(stage_4_var_combs[xtrain.columns]))
        shap_exp_val.index = stage_4_var_combs.index
        shap_exp_val.columns = ["shap_" + c for c in xtrain.columns]
        shap_exp_val["shap_Stadt"] = shap_exp_val["shap_Koeln"] + shap_exp_val["shap_Frankfurt"] + \
                                     shap_exp_val["shap_Stuttgart"] + shap_exp_val["shap_Muenchen"] + \
                                     shap_exp_val["shap_Hamburg"] + shap_exp_val["shap_Berlin"] + \
                                     shap_exp_val["shap_Duesseldorf"] + shap_exp_val["shap_Chemnitz"]
        stage_4_frame = stage_4_var_combs.join(shap_exp_val)
        stage_4_frame.to_csv(path_pre + "Frontend/xai_experiment/immobilien_stage4_shap.csv")

if not bagging:
    if not include_chemnitz:
        # simulated true prices by unbagged AI WITHOUT Chemnitz
        var_prop_comb_randomized = var_prop_comb.copy(deep=True)
        n_len = len(var_prop_comb_randomized)
        var_prop_comb_randomized["floor (storey)"], var_prop_comb_randomized["construction year"] = \
            np.random.choice(floor_range, size=n_len), np.random.choice(year_range, size=n_len)
        var_prop_comb_randomized["true_sim_price"] = pred_model.predict(var_prop_comb_randomized)
        var_prop_comb_randomized.to_csv(path_pre + "Frontend/xai_experiment/immobilien_sim_true_prices.csv")
    # STAGE 4
    else:
        # write feature combs of stage 4
        stage_4_var_combs = get_12_var_comb_for_cities("Duesseldorf", "Chemnitz", include_chemnitz=include_chemnitz)
        stage_4_var_combs.to_csv(path_pre + "Frontend/xai_experiment/immobilien_stage4_features.csv")

        # simulated true prices by unbagged AI WITH Chemnitz
        n_len = len(stage_4_var_combs)
        stage_4_var_combs["floor (storey)"], stage_4_var_combs["construction year"] = \
            np.random.choice(floor_range, size=n_len), np.random.choice(year_range, size=n_len)
        stage_4_var_combs["true_sim_price"] = pred_model.predict(stage_4_var_combs)
        stage_4_var_combs.to_csv(path_pre + "Frontend/xai_experiment/immobilien_stage4_sim_true_prices.csv")

# _____________________
# _____________________
# SUBSET FOR EXPERIMENT
# _____________________

# get features and label of subset, get prediction, get overall R^2
if bagging:
    test_subset = test_subset[(test_subset[city1] == 1) | (test_subset[city2] == 1)]
    x_sub, y_sub = test_subset.drop(columns="price"), test_subset["price"]
    y_sub_pred = pred_model.predict(x_sub)
    print(f"Overall R^2 in subset: {r2_score(y_sub, y_sub_pred)}")

    # get unique rows (must be 12 unique)
    pred_diff = abs(y_sub_pred - y_sub) / y_sub_pred
    all_rows = pd.DataFrame(x_sub)
    unq_rows = all_rows.drop_duplicates(subset=None)
    print(f"We have {len(unq_rows)} unique combinations ({len(test_subset)} total) in experimental subset.")

    # find indices for all rows of each unique combination of variable features
    subset_indices = all_rows.groupby(list(all_rows)).apply(lambda x: tuple(x.index)).tolist()

    # find best performing row for each unq comb of var features
    best_pred_indices = [pred_diff[pred_diff.index.isin(i_tuple)].idxmax() for i_tuple in subset_indices]
    best_perf_rows = test_subset[test_subset.index.isin(best_pred_indices)]

    # calculate R^2 for these best-performing rows
    x_sub_best, y_sub_best = best_perf_rows.drop(columns="price"), best_perf_rows["price"]
    y_sub_best_pred = pred_model.predict(x_sub_best)
    print(f"Selected R^2 in subset: {r2_score(y_sub_best, y_sub_best_pred)}")

    # write as ground truth (for incentive) to file
    true_price_instances = test_subset[test_subset.index.isin(best_pred_indices)]
    true_price_instances.to_csv(path_pre + "Frontend/xai_experiment/immobilien_true_prices.csv", index=False)  # mode="a", # write

# # calculate R^2 for previous best-performing and now in test set rows
# x_exp, y_exp = xtest[xtest.index.isin(experiment_indices)], ytest[ytest.index.isin(experiment_indices)]
# y_exp_pred = pred_model.predict(x_exp)
# print(f"Experimental R^2: {r2_score(y_exp, y_exp_pred)}")
# # show some price stats
# print(f"The average price/sqm of the fixed set (all {len(y_sub)} properties) is {np.nanmean(y_sub)} EUR.")
# print(f"The predicted average price/sqm of the fixed set (all {len(y_sub_pred)} properties) is {np.nanmean(y_sub_pred)} EUR.")
# print(f"The average price/sqm of the experimental properties (all {len(y_exp)}) is {np.nanmean(y_exp)} EUR.")
# print(f"The predicted average price/sqm of the experimental properties (all {len(y_exp_pred)}) is {np.nanmean(y_exp_pred)} EUR.")
#
# # calculate shap values
# explainer = shap.Explainer(pred_model, x_exp)
# shap_exp_val = pd.DataFrame(explainer.shap_values(x_exp))
# shap_exp_val.index = x_exp.index
# shap_exp_val.columns = x_exp.columns
# write_to_file = False
#
# prices = pd.DataFrame()
# prices["true"] = y_exp
# prices["predicted"] = y_exp_pred
#
# if write_to_file:
#     x_exp.to_csv(path_pre + "Frontend/xai_experiment/immobilien_features.csv")
#     prices.to_csv(path_pre + "Frontend/xai_experiment/immobilien_pred_prices.csv")
#     shap_exp_val.to_csv(path_pre + "Frontend/xai_experiment/immobilien_shap.csv")
#
# np.sum(shap_exp_val.iloc[1]) + prices["predicted"].iloc[1]
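# ---------------------------------------------------------------------------
# Optional sanity check (illustrative sketch, not part of the original pipeline):
# SHAP values are additive, so the per-feature contributions of one instance plus
# the explainer's expected value should roughly reproduce the model's prediction.
# Assumes `explainer` was created in the shap_plot section above; set the flag to
# True to run the check.
# ---------------------------------------------------------------------------
run_shap_sanity_check = False
if run_shap_sanity_check and shap_plot:
    sample = xtest[xtrain.columns].iloc[[0]]          # one held-out instance
    sample_shap = explainer.shap_values(sample)       # per-feature contributions
    reconstructed = float(np.ravel(explainer.expected_value)[0]) + float(np.sum(sample_shap))
    predicted = float(pred_model.predict(sample)[0])
    print(f"SHAP reconstruction: {reconstructed:.2f} vs. model prediction: {predicted:.2f}")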