import pandas as pd import numpy as np import requests from bs4 import BeautifulSoup import time import re import random # # user agents to be randomly selected in crawl # # if true, data is requested from website request_fresh_data = True stadt = "Chemnitz" # user agents to rotate HEADER = [ ({"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"}), ({"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"}), ({ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"}), ({ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"}), ({ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"}), ({"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko"}), ({"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322)"}), ({"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"}), ({"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"}), ({"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322)"}) ] def try_getting_by_id(obj, id, h1=False): try: element = "div" if h1: element = "h1" focal_val = obj.find_all(element, {"id": id}) focal_val = focal_val[0].get_text() focal_val = focal_val.replace("\n", "").replace("\t", "").replace("\xa0m²", "").replace("\xa0€", "") return focal_val except IndexError: return np.nan def try_getting_panel_features(obj): try: garden = "GARTEN" in obj.find("div", {"id": "panelFeatures"}).get_text().upper() basement = "KELLER" in obj.find("div", {"id": "panelFeatures"}).get_text().upper() elevator = "AUFZUG" in obj.find("div", {"id": "panelFeatures"}).get_text().upper() balcony = "BALKON" in obj.find("div", {"id": "panelFeatures"}).get_text().upper() floor_n = re.findall("Etage: (\d+)", obj.find("div", {"id": "panelFeatures"}).get_text()) if len(floor_n) != 1: floor_n = -1 else: floor_n = float(floor_n[0]) return garden, basement, elevator, balcony, floor_n except AttributeError: return False, False, False, False, -2 def try_getting_zip_code(obj): try: return obj.find("monthly-amount", {"class": "monthlyAmountBar"})['zip'] except TypeError: return np.nan # # scrape product pages for each product to get web data for feature engineering # if request_fresh_data: links = pd.read_csv("Data/WebScraper/link_list_" + stadt.lower() + ".csv") counter = 0 pages, parsed_pages = [], [] for l in links["url"]: counter += 1 print(f"Accessing {l}... ({counter}/{len(links)})") webpage = requests.get(l, headers=random.choice(HEADER)) pages.append(webpage) page_soup = BeautifulSoup(webpage.content, "lxml") parsed_pages.append(page_soup) time.sleep(1 + random.uniform(0, 4)) links["soup"] = parsed_pages links.to_csv("Data/RealEstate/immonet_data_" + stadt.lower() + "_raw.csv", index=False) else: links = pd.read_csv("Data/RealEstate/immonet_data_" + stadt.lower() + "_raw.csv") links = links[["url", "soup", "ort"]] links["soup"] = [BeautifulSoup(s) for s in links["soup"]] # initialize lists for feature values n_base_feat = [] base_fact_ids = [] prices, titles, zips = [], [], [] gardens, basements, elevators, balconys, floor_n = [], [], [], [], [] n_rooms, sq_meters, object_states, year_built, heat_type = [], [], [], [], [] # build features from web data scraped for row in links.itertuples(): print(row.url) # get price price = try_getting_by_id(row.soup, "priceid_1") prices.append(float(price)) # get zips code zip_code = try_getting_zip_code(row.soup) zips.append(zip_code) # get panel features garden, basement, elevator, balcony, floor_nr = try_getting_panel_features(row.soup) gardens.append(garden) basements.append(basement) elevators.append(elevator) balconys.append(balcony) floor_n.append(floor_nr) base_facts = row.soup.find_all("div", {"class": "col-xs-12 box-100 bg-white"}) try: base_feat = base_facts[0].find_all("div", {"class": "row list-100"}) except IndexError: n_base_feat.append(np.nan) titles.append(np.nan) n_rooms.append(np.nan) sq_meters.append(np.nan) object_states.append(np.nan) year_built.append(np.nan) heat_type.append(np.nan) continue n_base_feat.append(len(base_facts)) # get title title = try_getting_by_id(row.soup, "expose-headline", h1=True) titles.append(title) # get base features nmbr_of_rooms = try_getting_by_id(base_facts[0], "equipmentid_1") n_rooms.append(float(nmbr_of_rooms)) area_sq_meters = try_getting_by_id(base_facts[0], "areaid_1") sq_meters.append(float(area_sq_meters)) obj_state = try_getting_by_id(base_facts[0], "objectstatecategoryValue") object_states.append(obj_state) focal_year_built = try_getting_by_id(base_facts[0], "yearbuild") year_built.append(float(focal_year_built)) focal_heat_type = try_getting_by_id(row.soup, "heatTypeValue") heat_type.append(focal_heat_type) for dv in base_feat: sub_divs = dv.find_all('div') if len(sub_divs) == 2: try: base_fact_ids.append(sub_divs[1]['id']) except KeyError: base_fact_ids.append("error") # put features in dataframe links["title"] = titles links["zip_code"] = zips links["garden"] = gardens links["basement"] = basements links["elevator"] = elevators links["balcony"] = balconys links["floor_n"] = floor_n links["n_rooms"] = n_rooms links["sq_meters"] = sq_meters links["object_states"] = object_states links["year_built"] = year_built links["heat_type"] = heat_type links["price"] = prices # drop entries without price, number of rooms, or square meters links = links[~links.price.isna()] links = links[~links.n_rooms.isna()] links = links[~links.sq_meters.isna()] links = links[links.floor_n != -2] # excluding pages without panel features # write data to file links.to_csv("Data/RealEstate/immonet_data_" + stadt.lower() + ".csv", index=False) # find subset for experiment def find_subset(model_data, year_range, floor_range, room_range): mask = (~model_data.garden) & (model_data.basement) & (model_data.elevator) & \ (model_data["floor_n"].isin(floor_range)) & (model_data['n_rooms'].isin(room_range)) & \ (model_data['year_built'].isin(year_range)) print(np.sum(mask)) mask_data = model_data[mask] mask_data_by, mask_data_bn = \ mask_data[mask_data["balcony"] == 1], mask_data[mask_data["balcony"] == 0] print(f"Balcony yes: {len(mask_data_by)}") print(f"Balcony no: {len(mask_data_bn)}") return mask_data.copy(deep=True) year_range, floor_range, room_range = range(2012, 2023, 1), [1, 2, 3, 4, 5, 6, 7], [3, 3.5] test_subset = find_subset(links, year_range, floor_range, room_range) links.columns for data in [links, test_subset]: data.loc[:, "floor_n"] = [1 if f in floor_range else 2 for f in data["floor_n"]] data.loc[:, "year_built"] = [1 if y < 1946 else 3 if y >= np.min(year_range) else 2 for y in data["year_built"]] data.loc[:, "n_rooms"] = [1 if r < np.min(room_range) else 3 if r > np.max(room_range) else 2 for r in data["n_rooms"]]