import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random

#
# user agents to be randomly selected in crawl
#
HEADER = [
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322)"},
]

stadt = "Chemnitz"

#
# walk through the paginated search results and collect all listing links
#
links = []
for p in range(1, 13):
    print(f"Looking at page {p}...")
    url = ("https://www.immonet.de/immobiliensuche/sel.do?suchart=2&city=98925"
           "&marketingtype=1&pageoffset=1&radius=0&parentcat=1&sortby=0"
           f"&listsize=26&objecttype=1&page={p}")

    # fetch the search results page with a randomly chosen user agent
    webpage = requests.get(url, headers=random.choice(HEADER))
    search_soup = BeautifulSoup(webpage.content, "lxml")

    # each listing on the results page is an anchor tag with this class combination
    results = search_soup.find_all("a", {"class": "block ellipsis text-225 text-default"})
    for result in results:
        links.append(result["href"])

    # pause 10-15 seconds between pages to avoid hammering the server
    time.sleep(10 + random.uniform(0, 5))

#
# build the link list, drop duplicates, and save it to disk
#
link_list = pd.DataFrame()
link_list["url"] = ["https://immonet.de" + u for u in links]
link_list["ort"] = stadt
link_list = link_list.drop_duplicates()
link_list.to_csv("C:/Users/morit/PycharmProjects/xAI/Data/WebScraper/link_list_" + stadt.lower() + ".csv",
                 index=False)
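
#
# Optional hardening (a sketch, not part of the original scraper): the plain
# requests.get above neither checks the HTTP status nor retries on failure.
# A helper like the one below could be swapped in; the name fetch_page and the
# retry/backoff parameters are assumptions chosen for illustration.
#
def fetch_page(url, max_retries=3, backoff=5.0):
    """Fetch a URL with a random user agent, retrying with exponential backoff."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=random.choice(HEADER), timeout=30)
            response.raise_for_status()  # raise on 4xx/5xx responses
            return response
        except requests.RequestException as err:
            print(f"Attempt {attempt + 1}/{max_retries} for {url} failed: {err}")
            time.sleep(backoff * 2 ** attempt)  # wait 5 s, 10 s, 20 s, ...
    raise RuntimeError(f"Giving up on {url} after {max_retries} attempts")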
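
#
# Sketch of the natural follow-up step (an assumption, not in the original
# script): read the saved link list back in and visit each listing page.
# Which fields to extract depends on immonet's listing markup, so the body
# only prints the HTTP status as a placeholder; crawl_listings is a
# hypothetical name, not an existing function.
#
def crawl_listings(csv_path):
    """Visit every saved listing URL, politely spaced out."""
    link_list = pd.read_csv(csv_path)
    for listing_url in link_list["url"]:
        page = requests.get(listing_url, headers=random.choice(HEADER))
        print(listing_url, page.status_code)  # placeholder for real parsing
        time.sleep(10 + random.uniform(0, 5))  # same 10-15 s delay as above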