import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random

#
# user agents to be randomly selected in crawl
#
HEADER = [
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322)"},
]

stadt = "Chemnitz"

#
# walk through the paginated search results and collect all listing links
#
links = []
for p in range(1, 13):
    print(f"Looking at page {p}...")
    url = ("https://www.immonet.de/immobiliensuche/sel.do?suchart=2&city=98925"
           "&marketingtype=1&pageoffset=1&radius=0&parentcat=1&sortby=0"
           f"&listsize=26&objecttype=1&page={p}")

    # fetch the search results page with a randomly chosen user agent
    webpage = requests.get(url, headers=random.choice(HEADER))
    search_soup = BeautifulSoup(webpage.content, "lxml")

    # each listing on the results page is an anchor tag with this class combination
    results = search_soup.find_all("a", {"class": "block ellipsis text-225 text-default"})
    for result in results:
        links.append(result["href"])

    # pause 10-15 seconds between pages to avoid hammering the server
    time.sleep(10 + random.uniform(0, 5))

#
# build the link list, drop duplicates, and save it to disk
#
link_list = pd.DataFrame()
link_list["url"] = ["https://immonet.de" + u for u in links]
link_list["ort"] = stadt
link_list = link_list.drop_duplicates()
link_list.to_csv("C:/Users/morit/PycharmProjects/xAI/Data/WebScraper/link_list_" + stadt.lower() + ".csv",
                 index=False)
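
#
# Optional hardening (a sketch, not part of the original scraper): the plain
# requests.get above neither checks the HTTP status nor retries on failure.
# A helper like the one below could be swapped in; the name fetch_page and the
# retry/backoff parameters are assumptions chosen for illustration.
#
def fetch_page(url, max_retries=3, backoff=5.0):
    """Fetch a URL with a random user agent, retrying with exponential backoff."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=random.choice(HEADER), timeout=30)
            response.raise_for_status()  # raise on 4xx/5xx responses
            return response
        except requests.RequestException as err:
            print(f"Attempt {attempt + 1}/{max_retries} for {url} failed: {err}")
            time.sleep(backoff * 2 ** attempt)  # wait 5 s, 10 s, 20 s, ...
    raise RuntimeError(f"Giving up on {url} after {max_retries} attempts")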
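
#
# Sketch of the natural follow-up step (an assumption, not in the original
# script): read the saved link list back in and visit each listing page.
# Which fields to extract depends on immonet's listing markup, so the body
# only prints the HTTP status as a placeholder; crawl_listings is a
# hypothetical name, not an existing function.
#
def crawl_listings(csv_path):
    """Visit every saved listing URL, politely spaced out."""
    link_list = pd.read_csv(csv_path)
    for listing_url in link_list["url"]:
        page = requests.get(listing_url, headers=random.choice(HEADER))
        print(listing_url, page.status_code)  # placeholder for real parsing
        time.sleep(10 + random.uniform(0, 5))  # same 10-15 s delay as above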