# This is a Jupyter Notebook file
# The section above this one is called "markdown"
# This section is Python code (although the #'s indicate comments)
pip3 install --upgrade pip #Good idea, but not required
pip3 install pandas #This is the main tool we're talking about today
pip3 install jupyter #If you'd like to run this inside a jupyter notebook; otherwise, you can enter all of these commands inside of IDLE
Note: if you're using Anaconda instead of vanilla Python, you can skip all of that
jupyter notebook #After you type this, you'll need to start a new Python notebook
# Before we start, we need to import the libraries we're working with into local memory
import pandas as pd # everyone shortens the library name this way (to make it easier to type)
import numpy as np # this is a companion library to pandas that is used for some numeric work
# First way to create a dataframe: list of lists for data with an extra list to define columns:
dn_df = pd.DataFrame([['Donald','Duck'],['Mickey','Mouse'],['Minnie','Mouse']],columns=['First','Last'])
dn_df #Most people put _df at the end of their DataFrames as a reminder
type(dn_df)
# Alternatively, with a list of dictionaries:
pd.DataFrame([{'First':'Donald','Last':'Duck'},{'First':'Mickey','Last':'Mouse'},{'First':'Minnie','Last':'Mouse'}])
# That's fine for small files or for transforming data you have locally; in most instances, we'll probably start with a file
df_df = pd.read_csv('https://s3.amazonaws.com/mattniksch-python-pandas-intro/data_fellows.csv',encoding='latin1')
# Let's look at how big it is and then look at the top part
print(len(df_df))
df_df.head(2)
df_df.columns
# You can use normal Python control structures on the Index object above (it's an "iterable")
for column in df_df.columns:
print('{} is the name of one of the columns in this DataFrame'.format(column))
# Let's add a LastFirst field to our DataFrame:
df_df['LastFirst'] = df_df['Last']+', '+df_df['First']
df_df.tail()
# Use the index_col argument; here index_col=3 and index_col=['Organization'] are equivalent
# Note that I can chain .tail() to the creation of the DataFrame
# You can do this kind of thing a lot in Python, but sometimes it's clearer to use multiple lines
pd.read_csv('https://s3.amazonaws.com/mattniksch-python-pandas-intro/data_fellows.csv',index_col=3, encoding='latin1').tail()
# Organization is probably a weird choice for index; let's change it to LastFirst
df = df_df.set_index('LastFirst')
df.head()
# Entire row:
df.loc['Niksch, Matt']
# Entire column:
df['Organization']
This isn't a simple list, but instead a pandas Series object, which is kind of like the little sibling of a DataFrame
We'll talk about that more in a bit, but as a quick note, here are a few other ways to get a column:
df.Organization #you can skip the brackets if there are no special characters
df.loc[:, 'Organization'] #Similar to the rest of Python, you can use a : to specify "everything" in a dimension
type(df.loc[:,'Organization'])
# Here, I'm first getting a Series and then asking for the value at a specific index
df['Organization']['Niksch, Matt']
# Here, I'm asking for a point in the grid by specifying both axes
df.loc['Niksch, Matt','Gender']
# .loc is used above when you know the values; .iloc is used when you know the location in the grid
# In this example, I'm using the : as the second argument to say I want everything
# Note that Justin is the 4th person in the table above because this is a zero indexed system
df.iloc[3,:]
df.describe()
# That seems like a pretty weak description for all of those columns, but maybe that's because of the types of data we have
df.info()
# .describe() still won't do anything for non-numeric fields when summarizing the whole DF
# However, it should work on the individual Series:
df['Gender'].describe()
df['City'].describe()
# You can sort:
df2 = df.sort_values(['Cohort','Last', 'First'], ascending=[True,False,False])
df2.index
# You can grab a subset of columns:
df2[['Cohort','Organization']]
# Alternatively, you can drop columns:
df2.drop(['Gender','Organization'], axis=1).head(8)
# Note that this doesn't change df2 unless I set inplace=True as one of the arguments
# or rows:
df_test = df2.copy()
df_test.drop(['Wei, Dan']).head(8)
# You can match rows based on values:
df2[df2['Region/State']=='California']
# The above looks a little weird. That's because the inner expression is creating a Series
# of True/False values; you can chain these using Pythons & and | logical operators:
df2[(df2['Cohort']==1) & (df2['First'].str.startswith('N'))]
We're going to figure out the % Female and % Chicago for each cohort
g_count = df2[['Cohort','Gender','City']].groupby(['Gender','Cohort']).count()
g_count
# There are a few things happening here, but notably, the groupby() function created a multi-level index
# Let's play with that a little:
g_count.index
g_count.loc['Female']
g_count.loc['Female'].rename(columns={'City':'# Female'})
g_count_f = g_count.loc['Female'].rename(columns={'City':'# Female'}).T
g_count_f
g_count_m = g_count.loc['Male'].rename(columns={'City':'# Male'})
g_count_m
gender_df = pd.concat([g_count_f, g_count_m.T])
gender_df
gender_df.sum()
percent_female = gender_df.loc['# Female'] / gender_df.sum()
percent_female.name = '% Female'
percent_female
# Now, let's do % Chicago
df3=df2[['Cohort','City']]
percent_chicago = df3[df3['City']=='Chicago'].groupby(['Cohort']).count()/df3.groupby(['Cohort']).count()
percent_chicago.rename(columns={'City':'% Chicago'},inplace=True)
percent_chicago
pd.concat([percent_female, percent_chicago], axis=1)
def platin(word):
"""Applies pig latin translation to a provided word"""
if word[0].lower() in ['a','e','i','o','u']:
return word + 'way'
else:
return word[1].upper()+word[2:]+word[0].lower()+'ay'
def make_pig_latin_phrase(x):
"""This is an 'apply' function for use with pandas"""
first, last, city = x #we'll see below that this was sent as a tuple
return platin(first)+' '+platin(last)+' is from '+platin(city)
df[['First','Last','City']].apply(make_pig_latin_phrase,axis=1)
# A simpler example for apply:
df4 = df.copy()
df4['id'] = df4.index
df4['id'].apply(lambda x: x+' is an index!')
# When working on an index, we use "map"
df4.index.map(lambda x: x+' is an index!')