{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"___\n",
"\n",
" \n",
"___"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# DataFrames\n",
"\n",
"DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from numpy.random import randn\n",
"np.random.seed(101)"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.DataFrame(randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 -0.319318 -0.848077 0.605965\n",
"C -2.018168 0.740122 0.528813 -0.589001\n",
"D 0.188695 -0.758872 -0.933237 0.955057\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 186,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Selection and Indexing\n",
"\n",
"Let's learn the various methods to grab data from a DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A 2.706850\n",
"B 0.651118\n",
"C -2.018168\n",
"D 0.188695\n",
"E 0.190794\n",
"Name: W, dtype: float64"
]
},
"execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['W']"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W Z\n",
"A 2.706850 0.503826\n",
"B 0.651118 0.605965\n",
"C -2.018168 -0.589001\n",
"D 0.188695 0.955057\n",
"E 0.190794 0.683509"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pass a list of column names\n",
"df[['W','Z']]"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A 2.706850\n",
"B 0.651118\n",
"C -2.018168\n",
"D 0.188695\n",
"E 0.190794\n",
"Name: W, dtype: float64"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# SQL Syntax (NOT RECOMMENDED!)\n",
"df.W"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"DataFrame Columns are just Series"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 190,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(df['W'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Creating a new column:**"
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['new'] = df['W'] + df['Y']"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
" new | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
" 3.614819 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
" -0.196959 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
" -1.489355 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
" -0.744542 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
" 2.796762 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z new\n",
"A 2.706850 0.628133 0.907969 0.503826 3.614819\n",
"B 0.651118 -0.319318 -0.848077 0.605965 -0.196959\n",
"C -2.018168 0.740122 0.528813 -0.589001 -1.489355\n",
"D 0.188695 -0.758872 -0.933237 0.955057 -0.744542\n",
"E 0.190794 1.978757 2.605967 0.683509 2.796762"
]
},
"execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Removing Columns**"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 -0.319318 -0.848077 0.605965\n",
"C -2.018168 0.740122 0.528813 -0.589001\n",
"D 0.188695 -0.758872 -0.933237 0.955057\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 193,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop('new',axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
" new | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
" 3.614819 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
" -0.196959 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
" -1.489355 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
" -0.744542 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
" 2.796762 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z new\n",
"A 2.706850 0.628133 0.907969 0.503826 3.614819\n",
"B 0.651118 -0.319318 -0.848077 0.605965 -0.196959\n",
"C -2.018168 0.740122 0.528813 -0.589001 -1.489355\n",
"D 0.188695 -0.758872 -0.933237 0.955057 -0.744542\n",
"E 0.190794 1.978757 2.605967 0.683509 2.796762"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Not inplace unless specified!\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.drop('new',axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 -0.319318 -0.848077 0.605965\n",
"C -2.018168 0.740122 0.528813 -0.589001\n",
"D 0.188695 -0.758872 -0.933237 0.955057\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Can also drop rows this way:"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 -0.319318 -0.848077 0.605965\n",
"C -2.018168 0.740122 0.528813 -0.589001\n",
"D 0.188695 -0.758872 -0.933237 0.955057"
]
},
"execution_count": 197,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop('E',axis=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Selecting Rows**"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"W 2.706850\n",
"X 0.628133\n",
"Y 0.907969\n",
"Z 0.503826\n",
"Name: A, dtype: float64"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc['A']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Or select based off of position instead of label "
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"W -2.018168\n",
"X 0.740122\n",
"Y 0.528813\n",
"Z -0.589001\n",
"Name: C, dtype: float64"
]
},
"execution_count": 199,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Selecting subset of rows and columns **"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"-0.84807698340363147"
]
},
"execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc['B','Y']"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" Y | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.907969 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.848077 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W Y\n",
"A 2.706850 0.907969\n",
"B 0.651118 -0.848077"
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[['A','B'],['W','Y']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Conditional Selection\n",
"\n",
"An important feature of pandas is conditional selection using bracket notation, very similar to numpy:"
]
},
{
"cell_type": "code",
"execution_count": 202,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 -0.319318 -0.848077 0.605965\n",
"C -2.018168 0.740122 0.528813 -0.589001\n",
"D 0.188695 -0.758872 -0.933237 0.955057\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 202,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
" B | \n",
" True | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" C | \n",
" False | \n",
" True | \n",
" True | \n",
" False | \n",
"
\n",
" \n",
" D | \n",
" True | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" E | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A True True True True\n",
"B True False False True\n",
"C False True True False\n",
"D True False False True\n",
"E True True True True"
]
},
"execution_count": 203,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df>0"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" NaN | \n",
" NaN | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" NaN | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" NaN | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" NaN | \n",
" NaN | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 NaN NaN 0.605965\n",
"C NaN 0.740122 0.528813 NaN\n",
"D 0.188695 NaN NaN 0.955057\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df>0]"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 -0.319318 -0.848077 0.605965\n",
"D 0.188695 -0.758872 -0.933237 0.955057\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 205,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['W']>0]"
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A 0.907969\n",
"B -0.848077\n",
"D -0.933237\n",
"E 2.605967\n",
"Name: Y, dtype: float64"
]
},
"execution_count": 206,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['W']>0]['Y']"
]
},
{
"cell_type": "code",
"execution_count": 207,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Y | \n",
" X | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 0.907969 | \n",
" 0.628133 | \n",
"
\n",
" \n",
" B | \n",
" -0.848077 | \n",
" -0.319318 | \n",
"
\n",
" \n",
" D | \n",
" -0.933237 | \n",
" -0.758872 | \n",
"
\n",
" \n",
" E | \n",
" 2.605967 | \n",
" 1.978757 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Y X\n",
"A 0.907969 0.628133\n",
"B -0.848077 -0.319318\n",
"D -0.933237 -0.758872\n",
"E 2.605967 1.978757"
]
},
"execution_count": 207,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['W']>0][['Y','X']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For two conditions you can use | and & with parenthesis:"
]
},
{
"cell_type": "code",
"execution_count": 208,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 208,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[(df['W']>0) & (df['Y'] > 1)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## More Index Details\n",
"\n",
"Let's discuss some more features of indexing, including resetting the index or setting it something else. We'll also talk about index hierarchy!"
]
},
{
"cell_type": "code",
"execution_count": 209,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"A 2.706850 0.628133 0.907969 0.503826\n",
"B 0.651118 -0.319318 -0.848077 0.605965\n",
"C -2.018168 0.740122 0.528813 -0.589001\n",
"D 0.188695 -0.758872 -0.933237 0.955057\n",
"E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 209,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 210,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" 2 | \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" 3 | \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" 4 | \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index W X Y Z\n",
"0 A 2.706850 0.628133 0.907969 0.503826\n",
"1 B 0.651118 -0.319318 -0.848077 0.605965\n",
"2 C -2.018168 0.740122 0.528813 -0.589001\n",
"3 D 0.188695 -0.758872 -0.933237 0.955057\n",
"4 E 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 210,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reset to default 0,1...n index\n",
"df.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 211,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"newind = 'CA NY WY OR CO'.split()"
]
},
{
"cell_type": "code",
"execution_count": 212,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['States'] = newind"
]
},
{
"cell_type": "code",
"execution_count": 213,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
" States | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
" CA | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
" NY | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
" WY | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
" OR | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
" CO | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z States\n",
"A 2.706850 0.628133 0.907969 0.503826 CA\n",
"B 0.651118 -0.319318 -0.848077 0.605965 NY\n",
"C -2.018168 0.740122 0.528813 -0.589001 WY\n",
"D 0.188695 -0.758872 -0.933237 0.955057 OR\n",
"E 0.190794 1.978757 2.605967 0.683509 CO"
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" States | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" CA | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" NY | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" WY | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" OR | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" CO | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"States \n",
"CA 2.706850 0.628133 0.907969 0.503826\n",
"NY 0.651118 -0.319318 -0.848077 0.605965\n",
"WY -2.018168 0.740122 0.528813 -0.589001\n",
"OR 0.188695 -0.758872 -0.933237 0.955057\n",
"CO 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.set_index('States')"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
" States | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
" CA | \n",
"
\n",
" \n",
" B | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
" NY | \n",
"
\n",
" \n",
" C | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
" WY | \n",
"
\n",
" \n",
" D | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
" OR | \n",
"
\n",
" \n",
" E | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
" CO | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z States\n",
"A 2.706850 0.628133 0.907969 0.503826 CA\n",
"B 0.651118 -0.319318 -0.848077 0.605965 NY\n",
"C -2.018168 0.740122 0.528813 -0.589001 WY\n",
"D 0.188695 -0.758872 -0.933237 0.955057 OR\n",
"E 0.190794 1.978757 2.605967 0.683509 CO"
]
},
"execution_count": 215,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 216,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.set_index('States',inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" W | \n",
" X | \n",
" Y | \n",
" Z | \n",
"
\n",
" \n",
" States | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" CA | \n",
" 2.706850 | \n",
" 0.628133 | \n",
" 0.907969 | \n",
" 0.503826 | \n",
"
\n",
" \n",
" NY | \n",
" 0.651118 | \n",
" -0.319318 | \n",
" -0.848077 | \n",
" 0.605965 | \n",
"
\n",
" \n",
" WY | \n",
" -2.018168 | \n",
" 0.740122 | \n",
" 0.528813 | \n",
" -0.589001 | \n",
"
\n",
" \n",
" OR | \n",
" 0.188695 | \n",
" -0.758872 | \n",
" -0.933237 | \n",
" 0.955057 | \n",
"
\n",
" \n",
" CO | \n",
" 0.190794 | \n",
" 1.978757 | \n",
" 2.605967 | \n",
" 0.683509 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" W X Y Z\n",
"States \n",
"CA 2.706850 0.628133 0.907969 0.503826\n",
"NY 0.651118 -0.319318 -0.848077 0.605965\n",
"WY -2.018168 0.740122 0.528813 -0.589001\n",
"OR 0.188695 -0.758872 -0.933237 0.955057\n",
"CO 0.190794 1.978757 2.605967 0.683509"
]
},
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Multi-Index and Index Hierarchy\n",
"\n",
"Let us go over how to work with Multi-Index, first we'll create a quick example of what a Multi-Indexed DataFrame would look like:"
]
},
{
"cell_type": "code",
"execution_count": 253,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Index Levels\n",
"outside = ['G1','G1','G1','G2','G2','G2']\n",
"inside = [1,2,3,1,2,3]\n",
"hier_index = list(zip(outside,inside))\n",
"hier_index = pd.MultiIndex.from_tuples(hier_index)"
]
},
{
"cell_type": "code",
"execution_count": 254,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],\n",
" labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])"
]
},
"execution_count": 254,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hier_index"
]
},
{
"cell_type": "code",
"execution_count": 257,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" G1 | \n",
" 1 | \n",
" 0.153661 | \n",
" 0.167638 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.765930 | \n",
" 0.962299 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.902826 | \n",
" -0.537909 | \n",
"
\n",
" \n",
" G2 | \n",
" 1 | \n",
" -1.549671 | \n",
" 0.435253 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.259904 | \n",
" -0.447898 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.266207 | \n",
" 0.412580 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"G1 1 0.153661 0.167638\n",
" 2 -0.765930 0.962299\n",
" 3 0.902826 -0.537909\n",
"G2 1 -1.549671 0.435253\n",
" 2 1.259904 -0.447898\n",
" 3 0.266207 0.412580"
]
},
"execution_count": 257,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.random.randn(6,2),index=hier_index,columns=['A','B'])\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's show how to index this! For index hierarchy we use df.loc[], if this was on the columns axis, you would just use normal bracket notation df[]. Calling one level of the index returns the sub-dataframe:"
]
},
{
"cell_type": "code",
"execution_count": 260,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0.153661 | \n",
" 0.167638 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.765930 | \n",
" 0.962299 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.902826 | \n",
" -0.537909 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"1 0.153661 0.167638\n",
"2 -0.765930 0.962299\n",
"3 0.902826 -0.537909"
]
},
"execution_count": 260,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc['G1']"
]
},
{
"cell_type": "code",
"execution_count": 263,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A 0.153661\n",
"B 0.167638\n",
"Name: 1, dtype: float64"
]
},
"execution_count": 263,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc['G1'].loc[1]"
]
},
{
"cell_type": "code",
"execution_count": 265,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"FrozenList([None, None])"
]
},
"execution_count": 265,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.index.names"
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.index.names = ['Group','Num']"
]
},
{
"cell_type": "code",
"execution_count": 267,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" Group | \n",
" Num | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" G1 | \n",
" 1 | \n",
" 0.153661 | \n",
" 0.167638 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.765930 | \n",
" 0.962299 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.902826 | \n",
" -0.537909 | \n",
"
\n",
" \n",
" G2 | \n",
" 1 | \n",
" -1.549671 | \n",
" 0.435253 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.259904 | \n",
" -0.447898 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.266207 | \n",
" 0.412580 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"Group Num \n",
"G1 1 0.153661 0.167638\n",
" 2 -0.765930 0.962299\n",
" 3 0.902826 -0.537909\n",
"G2 1 -1.549671 0.435253\n",
" 2 1.259904 -0.447898\n",
" 3 0.266207 0.412580"
]
},
"execution_count": 267,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 270,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" Num | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0.153661 | \n",
" 0.167638 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.765930 | \n",
" 0.962299 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.902826 | \n",
" -0.537909 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"Num \n",
"1 0.153661 0.167638\n",
"2 -0.765930 0.962299\n",
"3 0.902826 -0.537909"
]
},
"execution_count": 270,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.xs('G1')"
]
},
{
"cell_type": "code",
"execution_count": 271,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A 0.153661\n",
"B 0.167638\n",
"Name: (G1, 1), dtype: float64"
]
},
"execution_count": 271,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.xs(['G1',1])"
]
},
{
"cell_type": "code",
"execution_count": 273,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" Group | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" G1 | \n",
" 0.153661 | \n",
" 0.167638 | \n",
"
\n",
" \n",
" G2 | \n",
" -1.549671 | \n",
" 0.435253 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"Group \n",
"G1 0.153661 0.167638\n",
"G2 -1.549671 0.435253"
]
},
"execution_count": 273,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.xs(1,level='Num')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Great Job!"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}