import numpy as np
import pandas as pd


# load the immonet listings of all cities into one frame and drop rows with missing key attributes
def get_immonet_data(path_pre, include_chemnitz):
    cities = ["ffm", "koeln", "muenchen", "hamburg", "stuttgart", "duesseldorf", "berlin"]
    if include_chemnitz:
        cities.append("chemnitz")
    immonet_data = pd.concat(
        [pd.read_csv(path_pre + f"Data/RealEstate/immonet_data_{city}.csv") for city in cities],
        ignore_index=True)
    # keep only listings where the core attributes are available
    immonet_data = immonet_data[
        (~immonet_data.n_rooms.isna()) & (~immonet_data.sq_meters.isna()) &
        (~immonet_data.price.isna()) & (~immonet_data.year_built.isna())].reset_index(drop=True)
    return immonet_data


# print out price statistics
def print_descriptive_statistics(model_data):
    print("City\tN\tAverage\tSt.deviation\t0.25 quantile\tMedian\t0.75 quantile")
    for city in ['Frankfurt am Main', "Muenchen"]:  # "Koeln",
        focal_data = model_data[model_data.ort == city]
        print(
            f"{city}\t{len(focal_data)}\t{round(np.average(focal_data.price), 2)}"
            f"\t{round(np.std(focal_data.price), 2)}\t{round(np.quantile(focal_data.price, 0.25), 2)}"
            f"\t{round(np.quantile(focal_data.price, 0.5), 2)}\t{round(np.quantile(focal_data.price, 0.75), 2)}")


# find the subset of listings used for the experiment
def find_subset(model_data, year_range, floor_range, room_range, unemployment=1):
    mask = (~model_data.garden) & model_data.elevator & model_data.basement & \
           (model_data["floor (storey)"].isin(floor_range)) & (model_data['nmbr of rooms'].isin(room_range)) & \
           (model_data['construction year'].isin(year_range)) & (model_data['unemployment'] == unemployment)
    print(f"Instances in subset: {np.sum(mask)}")
    mask_data = model_data[mask].copy(deep=True)
    # print availabilities per city, split by balcony
    for stadt in ["Frankfurt", "Hamburg", "Koeln", "Berlin", "Duesseldorf", "Stuttgart", "Muenchen", "Chemnitz"]:
        try:
            mask_data_stadt = mask_data[mask_data[stadt] == 1]
        except KeyError:
            print(f"{stadt} not included.")
            continue
        mask_data_stadt_by, mask_data_stadt_bn = \
            mask_data_stadt[mask_data_stadt["balcony"] == 1], mask_data_stadt[mask_data_stadt["balcony"] == 0]
        print(f"{stadt}: Balcony yes: {mask_data_stadt_by['Anteil Gruenenwaehler'].unique()}")
        print(f"{stadt}: Balcony no: {mask_data_stadt_bn['Anteil Gruenenwaehler'].unique()}")
    return mask_data.copy(deep=True)


# preprocess the zip mapping of a single city
def get_zip_mapping_for_city(city, zip_mapping):
    city_list, district, unemployment, zips, green_share = [], [], [], [], []
    for row in zip_mapping.itertuples():
        for plz in row[2].split(";"):
            city_list.append(city)
            district.append(row[1])
            unemployment.append(row[3])
            green_share.append(row[4])
            zips.append(int(plz))
    # discretize unemployment and green-voter share into terciles (1 = low, 2 = medium, 3 = high)
    thres_1 = np.nanquantile(unemployment, 0.3333)
    thres_2 = np.nanquantile(unemployment, 0.6666)
    unemployment = [1 if u <= thres_1 else 3 if u >= thres_2 else 2 for u in unemployment]
    thres_1 = np.nanquantile(green_share, 0.3333)
    thres_2 = np.nanquantile(green_share, 0.6666)
    green_share = [1 if u <= thres_1 else 3 if u >= thres_2 else 2 for u in green_share]
    zip_mapping_complete = pd.DataFrame(
        list(zip(city_list, district, zips, unemployment, green_share)),
        columns=['city', 'district', 'zips', 'unemployment', 'green_share'])
    return zip_mapping_complete


# get the zip mapping for the complete data using "get_zip_mapping_for_city"
def get_zip_mapping(path_pre):
    muc_zip_map = pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_muenchen.csv")
    muc_zip_map["ArbQuote"] = muc_zip_map["Arbeitslose"] / muc_zip_map["Einwohner"]
    muc_zip_map = muc_zip_map[['Stadtteil', 'PLZ', 'ArbQuote', 'GreenShare21']]
    zip_mapping = pd.concat([
        get_zip_mapping_for_city("Muenchen", muc_zip_map),
        get_zip_mapping_for_city("Koeln", pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_koeln.csv")),
        get_zip_mapping_for_city("Frankfurt", pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_ffm.csv")),
        get_zip_mapping_for_city("Hamburg", pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_hamburg.csv")),
        get_zip_mapping_for_city("Stuttgart", pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_stuttgart.csv")),
        get_zip_mapping_for_city("Duesseldorf", pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_duesseldorf.csv")),
        get_zip_mapping_for_city("Berlin",
                                 pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_berlin.csv", encoding='latin1')),
        get_zip_mapping_for_city("Chemnitz", pd.read_csv(path_pre + "Data/WebScraper/plz_mapping_chemnitz.csv"))],
        ignore_index=True)
    return zip_mapping


# get zip based features (green voters / unemployment) as a helper frame to join with the immonet data
def get_zip_features(zip_mapping):
    zip_helper = pd.DataFrame(index=zip_mapping.zips.unique())
    zip_helper["unemployment"] = [zip_mapping[zip_mapping.zips == z]["unemployment"].mode()[0]
                                  for z in zip_helper.index]
    zip_helper["green_share"] = [zip_mapping[zip_mapping.zips == z]["green_share"].mode()[0]
                                 for z in zip_helper.index]
    return zip_helper


# use names without tex-unfriendly symbols (e.g., '_')
def rename_house_features(feature_names):
    rename_c = {"n_rooms": "nmbr of rooms", "floor_n": "floor (storey)", "year_built": "construction year",
                "ort_Frankfurt am Main": "Frankfurt", "ort_Muenchen": "Muenchen", "ort_Koeln": "Koeln",
                "ort_Hamburg": "Hamburg", "ort_Stuttgart": "Stuttgart", "ort_Duesseldorf": "Duesseldorf",
                "ort_Berlin": "Berlin", "ort_Chemnitz": "Chemnitz", "green_share": "Anteil Gruenenwaehler"}
    return [rename_c[c] if c in rename_c else c for c in feature_names]


# get feature combinations for prediction in stages 1, 2, 3
def get_12_var_comb_for_cities(city_1, city_2, include_chemnitz=False):
    other_cities = ['Berlin', 'Duesseldorf', 'Frankfurt', 'Hamburg', 'Koeln', 'Muenchen', 'Stuttgart']
    if include_chemnitz:
        other_cities.append("Chemnitz")
    other_cities.remove(city_1)
    other_cities.remove(city_2)
    var_prop_combinations = pd.DataFrame()
    garden, basement, elevator, balcony, floor, nmbr_of_rooms, construction_year, unemployment, \
        Anteil_Gruenenwaehler, city_val_1, city_val_2 = [], [], [], [], [], [], [], [], [], [], []
    for j in range(1, 13):
        # fixed properties
        garden.append(False)
        basement.append(True)
        elevator.append(True)
        floor.append(1)
        nmbr_of_rooms.append(3)
        construction_year.append(9)
        unemployment.append(1)
        # variable properties
        Anteil_Gruenenwaehler.append(int((j + 3) / 4))
        city_val_1.append(j % 2)
        city_val_2.append((j + 1) % 2)
        balcony.append(int((j + 1) / 2) % 2)
    var_prop_combinations["garden"], var_prop_combinations["basement"], var_prop_combinations["elevator"], \
        var_prop_combinations["balcony"], var_prop_combinations["floor (storey)"], \
        var_prop_combinations["nmbr of rooms"], var_prop_combinations["construction year"], \
        var_prop_combinations["unemployment"], var_prop_combinations["Anteil Gruenenwaehler"], \
        var_prop_combinations[city_1], var_prop_combinations[city_2] = \
        garden, basement, elevator, balcony, floor, nmbr_of_rooms, construction_year, unemployment, \
        Anteil_Gruenenwaehler, city_val_1, city_val_2
    for city in other_cities:
        var_prop_combinations[city] = 0
    return var_prop_combinations


# get feature combinations for the average price of category A cities in Germany (for stages 1, 3)
def get_comb_for_cat_A_cities():
    cities = ['Berlin', 'Duesseldorf', 'Frankfurt', 'Hamburg', 'Koeln', 'Muenchen', 'Stuttgart']
    var_prop_combinations = pd.DataFrame()
    garden, basement, elevator, balcony, floor, nmbr_of_rooms, construction_year, unemployment, \
        Anteil_Gruenenwaehler, city = [], [], [], [], [], [], [], [], [], []
    for j in range(1, 43):
        # fixed properties
        garden.append(False)
        basement.append(True)
        elevator.append(True)
        floor.append(1)
        nmbr_of_rooms.append(3)
        construction_year.append(9)
        unemployment.append(1)
        # variable properties
        Anteil_Gruenenwaehler.append((int((j - 1) / 2) % 3) + 1)
        city.append(cities[int((j - 1) / 6)])
        balcony.append(j % 2)
    var_prop_combinations["garden"], var_prop_combinations["basement"], var_prop_combinations["elevator"], \
        var_prop_combinations["balcony"], var_prop_combinations["floor (storey)"], \
        var_prop_combinations["nmbr of rooms"], var_prop_combinations["construction year"], \
        var_prop_combinations["unemployment"], var_prop_combinations["Anteil Gruenenwaehler"], \
        var_prop_combinations["city"] = \
        garden, basement, elevator, balcony, floor, nmbr_of_rooms, construction_year, unemployment, \
        Anteil_Gruenenwaehler, city
    # one-hot encode the city column and strip the "city_" prefix so the columns match the renamed city dummies
    var_prop_combinations = pd.get_dummies(var_prop_combinations, columns=['city'])
    var_prop_combinations.columns = [col.replace("city_", "") for col in var_prop_combinations.columns]
    return var_prop_combinations