Data Visualization with matplotlib

Monte Lunacek

Objectives

References

This tutorial is based on some of the following excellent content.

Object and Functional Models

Functional

Object-oriented

Caution: redundant interface, namespace issues

Enabling plotting

IPython terminal

ipython --pylab
ipython --matplotlib

IPython notebook

%pylab inline
%matplotlib inline

ipython notebook --pylab=inline
ipython notebook --matplotlib=inline

The functional pylab interface

In [1]:
#inline to use with notebook (from pylab import *) 
%pylab inline 
Populating the interactive namespace from numpy and matplotlib

In [2]:
# make the plots smaller
rcParams['figure.figsize'] = 8, 4
In [3]:
x = linspace(0, 2*pi, 100)
y = np.sin(x)
plot(x, y)
show()
In [4]:
hist(randn(1000), alpha=0.5, histtype='stepfilled')
hist(0.75*randn(1000)+1, alpha=0.5, histtype='stepfilled') #hist?
show()
In [5]:
#hist?

Quick, easy, simple plots.

Object-oriented pyplot interface

In [1]:
#restart notebook
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [2]:
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 8, 4
In [3]:
plot(range(20))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-d0d4e9792ae1> in <module>()
----> 1 plot(range(20))

NameError: name 'plot' is not defined

Good, that's the error we want to see.

The figures and axes objects

First, we create a blank figure. Then we add a subplot.

In [4]:
x = np.linspace(0, 2*np.pi, 100) #same as before
y = np.sin(x)

fig = plt.figure()
ax = fig.add_subplot(1,1,1) # 1 row, 1 col, graphic 1
ax.plot(x, y)
fig.show()

Multiple subplots

In [5]:
fig = plt.figure()

ax1 = fig.add_subplot(1,2,1) # 1 row, 2 cols, graphic 1
ax2 = fig.add_subplot(1,2,2) # graphic 2

ax1.plot(x, y)

ax2.hist(np.random.randn(1000), alpha=0.5, histtype='stepfilled')
ax2.hist(0.75*np.random.randn(1000)+1, alpha=0.5, histtype='stepfilled')

fig.show()

The plt.subplots() command

In [6]:
fig, ax = plt.subplots(2,3)

ax[0,0].plot(x, y)
ax[0,2].hist(np.random.randn(100), alpha=0.5, color="g")
ax[1,1].scatter(np.random.randn(10), np.random.randn(10), color="r")

fig.show()

plt.plot?

==========  ========
character   color
==========  ========
'b'         blue
'g'         green
'r'         red
'c'         cyan
'm'         magenta
'y'         yellow
'k'         black
'w'         white
==========  ========

The subplot2grid command

In [14]:
# Lay out five axes on a 3x3 grid; each call gives the grid shape, the
# (row, col) of the axes' top-left cell, and optionally how far it spans.
fig = plt.figure(figsize=(8,6))
ax1 = plt.subplot2grid((3,3), (0,0), colspan=3)  # row 0, all three columns
ax2 = plt.subplot2grid((3,3), (1,0), colspan=2)  # row 1, columns 0-1
ax3 = plt.subplot2grid((3,3), (1,2), rowspan=2)  # column 2, rows 1-2
ax4 = plt.subplot2grid((3,3), (2,0))             # single cell, row 2 col 0
ax5 = plt.subplot2grid((3,3), (2,1))             # single cell, row 2 col 1
fig.tight_layout()
fig.show()

Sharing axis values

In [8]:
# Three stacked axes that share one x axis.
fig, axes = plt.subplots(3, 1, sharex=True)
for ax in axes:
    # Set the background through the axes patch, the same call the ggplot
    # styling cells use; `Axes.set_axis_bgcolor` no longer exists in newer
    # matplotlib releases, while the patch setter works in old and new ones.
    ax.patch.set_facecolor('0.95')
fig.show()
# The parenthesised form prints the same text on Python 2 and runs on Python 3.
print(axes.shape)
(3,)

In [9]:
fig, axes = plt.subplots( 2, 2, sharex = True, sharey = True)
plt.subplots_adjust( wspace = 0.1, hspace = 0.1)
fig.show()
print axes.shape
(2, 2)

How about a little d3.js with mpld3?

https://github.com/jakevdp/mpld3

In [10]:
from mpld3 import enable_notebook
enable_notebook()
In [12]:
fig, ax = plt.subplots(1,2, sharey=True, sharex=True)

print ax.shape

ax[0].plot(x, y, color='green')
ax[1].scatter(np.random.randn(10), np.random.randn(10), color='red')

fig.show()
(2,)

Matplotlib Style

Who doesn't like the feel and colors of ggplot?

How do we make matplotlib look like this?

Useful exercise (even if you don't appreciate this). References:

The scatter plot

In [20]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
In [21]:
cars = pd.read_csv(os.path.join('data','cars.csv'))
cars.head()
Out[21]:
type mpg cyl disp hp drat wt qsec vs am gear carb
0 MazdaRX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
1 MazdaRX4Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
2 Datsun710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
3 Hornet4Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
4 HornetSportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2

5 rows × 12 columns

In [22]:
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(cars['wt'], cars['mpg'])
fig.show()

Changing style

Check out color brewer and brewer2mpl wiki

In [23]:
import brewer2mpl

color = brewer2mpl.get_map('Set2', 'qualitative', 3).mpl_colors
In [24]:
# Scatter weight against mpg, one brewer color per cylinder count.
fig, ax = plt.subplots(figsize=(6,5))
for i, cyl in enumerate([4,6,8]):
    df = cars[cars['cyl'] == cyl]
    # Draw through the axes object and leave `ax` bound to it; assigning the
    # result of plt.scatter to `ax` would replace the axes with the returned
    # scatter artist and break any later ax.* call.
    ax.scatter(df['wt'], df['mpg'], s=100, alpha=0.95, edgecolor='none', c=color[i])
fig.show()

The beauty of objects

In [25]:
def base_figure():
    """Build the weight-vs-mpg scatter plot, colored by cylinder count.

    Reads the module-level ``cars`` frame and ``color`` list, and returns the
    freshly created ``(fig, ax)`` pair so callers can restyle the axes.
    """
    fig, ax = plt.subplots(figsize=(6,5))
    marker_style = dict(s=100, alpha=0.75, edgecolor='none')
    for index, cyl in enumerate([4,6,8]):
        subset = cars[cars['cyl'] == cyl]
        ax.scatter(subset['wt'], subset['mpg'], c=color[index], **marker_style)
    return fig, ax
In [26]:
fig, ax = base_figure()

ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')  

fig.show()
In [27]:
def remove_ticks(ax):
    """Turn off the tick marks on both the x and the y axis of *ax*."""
    for axis_name in ('xaxis', 'yaxis'):
        getattr(ax, axis_name).set_ticks_position('none')
    
def remove_splines(ax, spl):
    """Hide the spines of *ax* named in *spl* (e.g. ``['top', 'right']``)."""
    for side in spl:
        spine = ax.spines[side]
        spine.set_visible(False)

def modify_splines(ax, lwd, col):
    """Apply line width *lwd* and color *col* to all four spines of *ax*."""
    for side in ('bottom', 'left', 'top', 'right'):
        spine = ax.spines[side]
        spine.set_linewidth(lwd)
        spine.set_color(col)
           
In [28]:
fig, ax = base_figure()

remove_ticks(ax)
modify_splines(ax, lwd=0.75, col='0.8')
remove_splines(ax, ['top','right'])

ax.patch.set_facecolor('0.93')
ax.grid(True, 'major', color='0.98', linestyle='-', linewidth=1.0)
ax.set_axisbelow(True)   

fig.show()

Define custom transformations

In [29]:
def ggplot(ax):
    """Restyle *ax* in place to resemble a ggplot panel.

    Removes the tick marks, thins and greys the spines, hides the top and
    right spines, then sets a light grey background with a near-white major
    grid drawn beneath the data.
    """
    # Frame: ticks off, thin grey spines, no top/right border.
    remove_ticks(ax)
    modify_splines(ax, 0.75, '0.8')
    remove_splines(ax, ['top','right'])

    # Panel: grey background with a pale grid kept below the plotted artists.
    grid_style = dict(color='0.98', linestyle='-', linewidth=1.0)
    ax.patch.set_facecolor('0.93')
    ax.grid(True, 'major', **grid_style)
    ax.set_axisbelow(True)
In [30]:
fig, ax = base_figure()
ggplot(ax)
fig.show()

Legends

In [32]:
def base_figure():
    """Build the weight-vs-mpg scatter plot with one labelled series per cylinder count.

    Same plot as the earlier version of this function, except that every
    series carries a ``'<n> cyl'`` label so ``ax.legend()`` has entries to
    show. Reads the module-level ``cars`` frame and ``color`` list and
    returns ``(fig, ax)``.
    """
    fig, ax = plt.subplots(figsize=(6,5))
    marker_style = dict(s=100, alpha=0.75, edgecolor='none')

    for index, cyl in enumerate([4,6,8]):
        subset = cars[cars['cyl'] == cyl]
        series_label = '{0} cyl'.format(cyl)  # text shown in the legend
        ax.scatter(subset['wt'],
                   subset['mpg'],
                   c=color[index],
                   label=series_label,
                   **marker_style)

    return fig, ax

fig, ax = base_figure()

ax.legend(loc='best')

ggplot(ax)

fig.show()
In [36]:
def nice_legend(ax):
    """Soften the legend box of *ax*: zero-width border and a half-transparent frame.

    Does nothing when the axes has no legend.
    """
    legend = ax.legend_
    if legend is None:
        return
    legend.get_frame().set_linewidth(0)
    legend.get_frame().set_alpha(0.5)
In [37]:
fig, ax = base_figure()

ax.legend(loc='best', scatterpoints=1) # for a single point

ggplot(ax)
nice_legend(ax)

fig.show()        

Changing your default style

You can add some custom styles in your ~/.matplotlib/matplotlibrc file.

In [38]:
fig, ax = base_figure()
fig.show()

Setting the mpl.rcParams

The default figure size.

In [42]:
mpl.rcParams['figure.figsize'] = 8, 4

Change the axes background color, turn on grid lines, change the color.

In []:
mpl.rcParams['axes.facecolor'] = '0.93'
mpl.rcParams['axes.grid'] = True
mpl.rcParams['grid.linestyle'] = '-'
mpl.rcParams['grid.linewidth'] = 1
mpl.rcParams['grid.color'] = '1.0' 
mpl.rcParams['axes.axisbelow'] = True
mpl.rcParams['axes.linewidth'] = 0.5
mpl.rcParams['axes.edgecolor'] = '0.7' #can't remove some
mpl.rcParams['xtick.major.size'] = 0.0
mpl.rcParams['ytick.major.size'] = 0.0

Modify the legend.

In [45]:
mpl.rcParams['legend.fancybox'] = True
mpl.rcParams['legend.scatterpoints'] = 1
mpl.rcParams['legend.frameon'] = False
In [51]:
fig, ax = base_figure()
ax.legend(loc='best')
fig.show()

And many more options....

In [49]:
# First ten rc parameter names in alphabetical order. Sorting into a list
# first keeps the slice valid where keys() returns a view instead of a list.
sorted(mpl.rcParams.keys())[:10]
Out[49]:
['agg.path.chunksize',
 'animation.avconv_args',
 'animation.avconv_path',
 'animation.bitrate',
 'animation.codec',
 'animation.convert_args',
 'animation.convert_path',
 'animation.ffmpeg_args',
 'animation.ffmpeg_path',
 'animation.frame_format']

Let's save that for later...

In [68]:
import json

# Snapshot the current rcParams as JSON so a later session can reload them.
with open('mplrc.json','w') as rc_file:
    serialized = json.dumps(mpl.rcParams)
    rc_file.write(serialized)

Tricks with itertools and functools

In [55]:
fig, ax = plt.subplots(figsize=(6,5))

for index, cyl in enumerate([4,6,8]):
    df = cars[cars['cyl'] == cyl]
    ax.scatter(df['wt'], 
               df['mpg'], 
               c=color[index], 
               s=100, 
               alpha=0.75, 
               edgecolor='none',
               label='{0} cyl'.format(cyl))  # adding a label
    
ax.legend(loc='best')
fig.show()
In [56]:
import itertools
from functools import partial
In [57]:
color_iter = itertools.cycle(color)
partial_scatter = partial(plt.scatter, s=100, alpha=0.75, edgecolor='none')
In [63]:
# Same scatter plot, with the shared marker style baked into partial_scatter
# and the colors pulled from the cycling iterator.
fig, ax = plt.subplots(figsize=(6,5))
for cyl in [4,6,8]:
    df = cars[cars['cyl'] == cyl]

    # partial_scatter wraps plt.scatter, which draws on the current axes (the
    # one just created). Its return value is the scatter artist, so it must
    # not be assigned to `ax`, which has to stay bound to the axes.
    partial_scatter(df['wt'], df['mpg'], c=next(color_iter), label='{0} cyl'.format(cyl))

fig.show()

How about as a d3 svg?

https://github.com/jakevdp/mpld3

In [67]:
from mpld3 import enable_notebook
enable_notebook()

# Render the base scatter figure with the mpld3 notebook renderer enabled.
fig, ax = base_figure()
#ax.legend(loc='best')  # Not quite yet
# NOTE(review): the legend call is disabled on purpose; presumably mpld3 did
# not render legends when this was written -- confirm before re-enabling.
fig.show()

Examples

In [141]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

A little style from the previous session.

In [142]:
import json
import brewer2mpl

# Reload the rcParams snapshot from 'mplrc.json'. The context manager closes
# the file handle promptly instead of leaving it to the garbage collector.
with open('mplrc.json','r') as rc_file:
    data = json.load(rc_file)

for key, value in data.items():
    try:
        mpl.rcParams[key] = value
    except ValueError:
        # Deliberate best effort: a value that rcParams rejects with
        # ValueError after the JSON round trip is skipped, and that
        # setting keeps its current value.
        pass

# Use the 8-color qualitative 'Set1' brewer palette as the line color cycle.
colors = brewer2mpl.get_map('Set1', 'qualitative', 8).mpl_colors
mpl.rcParams['axes.color_cycle'] = colors

The line graph

plt.plot?

In [143]:
fig, ax = plt.subplots()
ax.plot(np.random.randn(200).cumsum())
fig.show()
In [192]:
fig, ax = plt.subplots()
for i in range(8):
    ax.plot(np.random.randn(200).cumsum())
fig.show()

Histogram

?plt.hist

In [145]:
mu = 100
sigma = 15
x = mu + sigma * np.random.randn(300)

fig, ax = plt.subplots()
ax.hist(x, alpha=0.5, bins=20)
fig.show()
In [191]:
# Overlay three histograms; each sample is 300 normal draws scaled by `sigma`
# and shifted by one random offset (itself a normal draw scaled by 20).
fig, ax = plt.subplots()
for i in range(3):
    x = 20.0*np.random.randn() + sigma*np.random.randn(300)
    # normed=1 asks hist for a normalized histogram instead of raw counts.
    # NOTE(review): later matplotlib releases replaced `normed` with
    # `density` -- confirm which version this notebook targets.
    ax.hist(x, normed=1, alpha=0.5, histtype='stepfilled', bins=20)
fig.show()

Kernel Density Estimates

In [147]:
from sklearn.neighbors.kde import KernelDensity
In [148]:
# Plot a kernel density estimate for three random samples of 100 points each.
fig, ax = plt.subplots()
for i in range(3):
    
    data = 20.0*np.random.randn() + sigma*np.random.randn(100)
    # Evaluate the estimate on 100 evenly spaced points across the sample range.
    x = np.linspace(data.min(), data.max(), 100)

    # For sklearn: turn both 1-D arrays into single-column 2-D arrays.
    data = data.reshape(-1, 1)
    x = x.reshape(-1, 1)
    
    kde = KernelDensity().fit(data)  # you can adjust the 'bandwidth' parameter
    # score_samples returns log-density values, hence the np.exp.
    density = np.exp(kde.score_samples(x))
    ax.plot(x, density)

fig.show()

Scipy.stats

In [149]:
from scipy import stats

# Plot a Gaussian kernel density estimate for three random samples.
fig, ax = plt.subplots()
for i in range(3):

    x = 20.0*np.random.randn() + sigma*np.random.randn(300)
    # Evaluate 10 units beyond the sample range so the tails are drawn;
    # the ndarray min/max methods avoid a Python-level scan of the array.
    xd = np.linspace(x.min()-10, x.max()+10, 100)
    # Use the public scipy.stats.gaussian_kde name rather than reaching
    # through the stats.kde implementation submodule.
    density = stats.gaussian_kde(x)
    ax.plot(xd, density(xd))

fig.show()

fill_between

In [190]:
import itertools
# Endless iterator over the configured color cycle, one color per curve.
colors = itertools.cycle(mpl.rcParams['axes.color_cycle'])

fig, ax = plt.subplots()
for i in range(4):

    x = 20.0*np.random.randn() + sigma*np.random.randn(300)
    # Evaluate 10 units beyond the sample range so the tails are drawn.
    xd = np.linspace(x.min()-10, x.max()+10, 100)
    # Use the public scipy.stats.gaussian_kde name rather than reaching
    # through the stats.kde implementation submodule.
    density = stats.gaussian_kde(x)

    # Shade the area between the baseline (0) and the density curve.
    ax.fill_between(xd, 0, density(xd), alpha=0.25, color=next(colors), linewidth=2)

fig.show()

Combined hist and kde

In [189]:
# Draw each sample twice in the same color: a normalized histogram and the
# matching kernel density curve on top of it.
fig, ax = plt.subplots()
for i in range(2):

    x = 20.0*np.random.randn() + sigma*np.random.randn(300)
    # Evaluate 10 units beyond the sample range so the tails are drawn.
    xd = np.linspace(x.min()-10, x.max()+10, 100)
    # Use the public scipy.stats.gaussian_kde name rather than reaching
    # through the stats.kde implementation submodule.
    density = stats.gaussian_kde(x)
    c = next(colors)  # one color shared by the histogram and its curve

    ax.hist(x, normed=1, alpha=0.25, color=c, histtype='stepfilled')
    ax.plot(xd, density(xd), alpha=0.75, color=c, linewidth=2)

fig.show()

Bar charts

Adapted from Harvard CS109.

In [152]:
# Bar chart of one value per year, with the value written above each bar.
years = [2004, 2005, 2006, 2007, 2008]
heights = [501, 607, 709, 650, 532]
box_colors = mpl.rcParams['axes.color_cycle']

fig, ax = plt.subplots()

# Ask for centered bars explicitly instead of shifting the positions by half
# the default bar width: the shift is only right when bars are edge-aligned,
# and the default alignment differs between matplotlib releases.
ax.bar(years, heights, align='center', color=box_colors, alpha=0.75)

ax.set_xlim(2003.5, 2008.5)
ax.set_ylim(0,800)

# Annotate through the axes object, and use loop names that do not clobber
# the x / y data arrays used by the other cells.
for year, height in zip(years, heights):
    ax.annotate('{0}'.format(height), (year, height + 20), ha='center')

fig.show()

The box plot

In [153]:
fig, ax = plt.subplots()

d1 = 20.0*np.random.randn() + sigma*np.random.randn(300)
d2 = 20.0*np.random.randn() + sigma*np.random.randn(300)

data = [d1, d2]
bp = ax.boxplot(data, widths=0.65)

fig.show()

Error bars

In [154]:
# Sine curve with a vertical error bar at every sample point.
x = np.linspace(0, 10, 50)
# Despite its name, `xerr` is used below as the *vertical* error: one normal
# draw per point, centered on sin(x) with standard deviation 0.4.
xerr = np.random.normal(np.sin(x), 0.4)
y = np.sin(x)

fig, ax = plt.subplots()

ax.plot(x, y)
# The third positional parameter of errorbar is `yerr`, so pass it by keyword
# to make the direction explicit. The draws can be negative, and error sizes
# are magnitudes, so take the absolute value: the bars still span
# y - |xerr| .. y + |xerr|, and matplotlib versions that reject negative
# error values accept the call.
ax.errorbar(x, y, yerr=np.abs(xerr), fmt='.k')

fig.show()
In [155]:
fig, ax = plt.subplots()

ax.plot(x, y)
ad = abs(y-xerr)
ax.fill_between(x, y - ad, y + ad, color='0.5', alpha=0.2)

fig.show()
In [180]:
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=20000, centers=2, random_state=37, cluster_std=4)
x = X[:,0]
y = X[:,1]

fig, ax = plt.subplots()

ax.plot(x,y, 'o', alpha=0.02)

fig.show()
In [181]:
fig, ax = plt.subplots(figsize=(6,5))

ax.hexbin(x, y, gridsize=20)

fig.show()