"""Scrape charity profiles from Charity Navigator into a CSV file.

The script downloads the alphabetical index page, follows the charity links
found under ``.charities``, and for every profile page it can parse records
the category, location, name and seven heading/value pairs taken from the
``.tab-pane h2`` elements.  Profiles that cannot be fetched or parsed are
skipped.  The collected records are written to ``charities_database.csv``.
"""

import logging
from random import randint
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

INDEX_URL = "https://www.charitynavigator.org/index.cfm?bay=search.alpha"
OUTPUT_CSV = 'charities_database.csv'

# Slice of index-page links to visit.  Link 0 is deliberately left out,
# presumably because it is not a charity entry -- TODO confirm.
FIRST_LINK = 1
LAST_LINK = 300

# The first 14 ``.tab-pane h2`` elements are read as 7 consecutive
# (title, value) pairs.
TAB_HEADING_COUNT = 14

# Seconds to wait for a response before giving up; without a timeout a
# stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Bounds (inclusive) of the random pause, in seconds, between profile requests.
MIN_DELAY = 1
MAX_DELAY = 4


def _fetch_soup(url):
    """Download *url* and return its parsed HTML.

    Raises:
        requests.RequestException: on connection errors, timeouts, invalid
            URLs, or a 4xx/5xx response status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def _list_charities():
    """Return ``(name, href)`` pairs for the charity links on the index page."""
    soup = _fetch_soup(INDEX_URL)
    links = soup.find(id="bodywrap").select(".charities a")
    # Taking name and href from the same tag keeps them aligned without any
    # index bookkeeping.
    return [
        (link.get_text(), link.get('href'))
        for link in links[FIRST_LINK:LAST_LINK]
    ]


def _parse_profile(soup, name):
    """Build the record for one charity from its parsed profile page.

    Raises:
        AttributeError: if an expected container element is missing.
        IndexError: if the header has no ``h3`` or there are fewer than
            ``TAB_HEADING_COUNT`` tab headings.
    """
    header = soup.find(class_='page-header')
    # The first h3 holds '|'-separated fields: the first one is stored as the
    # category and the last one as the location.
    fields = header.select("h3")[0].get_text().split('|')
    record = {'Category': fields[0], 'Location': fields[-1]}

    report = soup.find(class_='tab-content')
    headings = report.select(".tab-pane h2")
    record['Name'] = name
    for ind in range(0, TAB_HEADING_COUNT, 2):
        title = headings[ind].get_text()
        value = headings[ind + 1].get_text()
        record[title] = value.replace('%%years', ' years')
    return record


def main():
    """Crawl the listed charities and write the results to ``OUTPUT_CSV``."""
    records = []
    for name, href in _list_charities():
        # Sleep a random number of seconds (between 1 and 4) to avoid
        # hammering the server.
        sleep(randint(MIN_DELAY, MAX_DELAY))
        # Progress indicator: number of charities collected so far.
        print(len(records))
        try:
            records.append(_parse_profile(_fetch_soup(href), name))
        except (requests.RequestException, AttributeError, IndexError) as exc:
            # Best effort: a profile that cannot be fetched or lacks the
            # expected markup is skipped.  Only these errors are caught so
            # that Ctrl-C and genuine bugs still stop the run.
            logger.warning("Skipping %r (%s): %s", name, href, exc)

    output = pd.DataFrame.from_records(records)
    output.to_csv(OUTPUT_CSV, encoding='utf-8-sig')


if __name__ == "__main__":
    main()