import pandas as pd

# Month-end totals for the columns 'pandas' through 'bokeh', up to August 2021
stackoverflow_monthly = (
    pd.read_csv(
        '../data/stackoverflow.zip',
        index_col='creation_date',
        parse_dates=True,
    )
    .loc[:'2021-08', 'pandas':'bokeh']
    .resample('1ME')
    .sum()
)
stackoverflow_monthly.sample(5, random_state=1)


import matplotlib_inline
from utils import mpl_svg_config

matplotlib_inline.backend_inline.set_matplotlib_formats(
    'svg', # output images using SVG format
    **mpl_svg_config('section-1') # optional: configure metadata
)


stackoverflow_monthly.matplotlib.plot(
    figsize=(8, 2), xlabel='creation date', ylabel='total questions', 
    title='Matplotlib Questions per Month\n(since the creation of Stack Overflow)'
)

<Axes: title={'center': 'Matplotlib Questions per Month\n(since the creation of Stack Overflow)'}, xlabel='creation date', ylabel='total questions'>


import matplotlib.pyplot as plt


# figsize is determined by rcParams for plt.plot()
plt.plot(stackoverflow_monthly.index, stackoverflow_monthly.matplotlib)

_ = plt.xlabel('creation date')
_ = plt.ylabel('total questions')
_ = plt.title('Matplotlib Questions per Month\n(since the creation of Stack Overflow)')


# creates the Figure and adds a single Axes object
fig, ax = plt.subplots(figsize=(8, 2))

ax.plot(stackoverflow_monthly.index, stackoverflow_monthly.matplotlib)

ax.set_xlabel('creation date')
ax.set_ylabel('total questions')
ax.set_title('Matplotlib Questions per Month\n(since the creation of Stack Overflow)')

Text(0.5, 1.0, 'Matplotlib Questions per Month\n(since the creation of Stack Overflow)')


ax = stackoverflow_monthly.matplotlib.plot(
    figsize=(8, 2), xlabel='creation date', ylabel='total questions', 
    title='Matplotlib Questions per Month\n(since the creation of Stack Overflow)'
)
ax.set_ylim(0, None) # this can also be done with pandas

# hide some of the spines (must be done with Matplotlib)
ax.spines[['top', 'right']].set_visible(False)


stackoverflow_daily = pd.read_csv(
    '../data/stackoverflow.zip', parse_dates=True, index_col='creation_date'
).loc[:,'pandas':'bokeh'].resample('1D').sum()
stackoverflow_daily.tail()


avgs = stackoverflow_daily.rolling('30D').mean()
stds = stackoverflow_daily.rolling('30D').std()

avgs.tail()


fig, ax = plt.subplots(figsize=(8, 2))
ax.plot(avgs.index, avgs.matplotlib)

[<matplotlib.lines.Line2D at 0x135a240e0>]


fig, ax = plt.subplots(figsize=(8, 2))
ax.plot(avgs.index, avgs.matplotlib)
ax.fill_between(
    avgs.index, avgs.matplotlib - 2 * stds.matplotlib, 
    avgs.matplotlib + 2 * stds.matplotlib, alpha=0.25
)

<matplotlib.collections.PolyCollection at 0x135a64c50>


fig, ax = plt.subplots(figsize=(8, 2))
ax.plot(avgs.index, avgs.matplotlib)
ax.fill_between(
    avgs.index, avgs.matplotlib - 2 * stds.matplotlib, 
    avgs.matplotlib + 2 * stds.matplotlib, alpha=0.25
)

ax.set(
    xlabel='creation date', ylabel='total questions', ylim=(0, None),
    title='Rolling 30-Day Average of Matplotlib Questions per Day'
)

ax.spines[['top', 'right']].set_visible(False)


def despine(ax):
    """Hide the top and right spines of `ax` and return the same Axes."""
    hidden_sides = ['top', 'right']
    ax.spines[hidden_sides].set_visible(False)
    return ax

fmt = '[marker][line][color]'


fig, ax = plt.subplots(figsize=(9, 3))
ax.plot(
    stackoverflow_monthly.index,
    stackoverflow_monthly.matplotlib, 
    'ok', label=None, alpha=0.5
)

[<matplotlib.lines.Line2D at 0x13726b170>]


import matplotlib.dates as mdates

x_axis_dates = mdates.date2num(stackoverflow_monthly.index)
x_axis_dates[:5]

array([14152., 14183., 14213., 14244., 14275.])


fig, ax = plt.subplots(figsize=(9, 3))
ax.plot(
    x_axis_dates, stackoverflow_monthly.matplotlib, 
    'ok', label=None, alpha=0.5
)

[<matplotlib.lines.Line2D at 0x135acec90>]

import numpy as np

degree = 1
poly = np.polynomial.Polynomial.fit(
    x_axis_dates, stackoverflow_monthly.matplotlib, degree
)
points = poly.linspace(n=100)  # 100 evenly-spaced points along the domain


import numpy as np

fig, ax = plt.subplots(figsize=(9, 3))
ax.plot(x_axis_dates, stackoverflow_monthly.matplotlib, 'ok', label=None, alpha=0.5)

for degree, linestyle in zip([1, 2], ['solid', 'dashed']):
    poly = np.polynomial.Polynomial.fit(
        x_axis_dates, stackoverflow_monthly.matplotlib, degree
    )
    ax.plot(*poly.linspace(), label=degree, linestyle=linestyle, linewidth=2, alpha=0.9)


def add_best_fit_lines(ax, x, y):
    """Overlay least-squares polynomial fits of degree 1 and 2 on `ax`.

    Each fit is drawn from the points returned by `Polynomial.linspace()`;
    the line is labeled with its degree (solid for 1, dashed for 2).
    Returns the same Axes.
    """
    styles_by_degree = {1: 'solid', 2: 'dashed'}
    for degree, linestyle in styles_by_degree.items():
        fitted = np.polynomial.Polynomial.fit(x, y, degree)
        fit_x, fit_y = fitted.linspace()
        ax.plot(
            fit_x, fit_y,
            label=degree, linestyle=linestyle, linewidth=2, alpha=0.9
        )
    return ax


def add_labels(ax, xmin):
    """Label the axes, title the plot, set the lower limits, and add a legend.

    The x-axis starts at `xmin` and the y-axis at 0; upper limits are left
    as None. The legend is titled 'degree'. Returns the same Axes.
    """
    title = 'Matplotlib Questions per Month\n(since the creation of Stack Overflow)'
    ax.set(
        xlabel='creation date',
        ylabel='total questions',
        xlim=(xmin, None),
        ylim=(0, None),
        title=title,
    )
    ax.legend(title='degree')
    return ax


fig, ax = plt.subplots(figsize=(9, 3))
ax.plot(x_axis_dates, stackoverflow_monthly.matplotlib, 'ok', label=None, alpha=0.5)

add_best_fit_lines(ax, x_axis_dates, stackoverflow_monthly.matplotlib)

add_labels(ax, x_axis_dates[0])
despine(ax)

<Axes: title={'center': 'Matplotlib Questions per Month\n(since the creation of Stack Overflow)'}, xlabel='creation date', ylabel='total questions'>

ax.xaxis # access the x-axis
ax.yaxis # access the y-axis

ax.xaxis.set_major_locator(mdates.MonthLocator(interval=16))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))

from matplotlib import ticker

ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))


from matplotlib import ticker

def format_axes(ax):
    """Format the tick marks of both axes and return the same Axes.

    Major x ticks are placed by a `MonthLocator` with `interval=16` and
    labeled as abbreviated month over year; y tick labels get thousands
    separators and no decimals.
    """
    x_axis, y_axis = ax.xaxis, ax.yaxis
    x_axis.set_major_locator(mdates.MonthLocator(interval=16))
    x_axis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))
    y_axis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    return ax


fig, ax = plt.subplots(figsize=(9, 3))
ax.plot(x_axis_dates, stackoverflow_monthly.matplotlib, 'ok', label=None, alpha=0.5)

add_best_fit_lines(ax, x_axis_dates, stackoverflow_monthly.matplotlib)

add_labels(ax, x_axis_dates[0])
despine(ax)
format_axes(ax)

<Axes: title={'center': 'Matplotlib Questions per Month\n(since the creation of Stack Overflow)'}, xlabel='creation date', ylabel='total questions'>


import pandas as pd

weather = pd.read_csv('../data/weather.csv', parse_dates=True, index_col='date')
weather.head()


import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from utils import despine


def solution(data):
    """Plot LA and NYC average daily temperatures and hatch where NYC is hotter.

    Reads the `city` and `TAVG` columns of `data`; each city's index is used
    for the x-axis. Returns whatever `despine(ax)` returns.
    """
    temps = {
        city: data.query(f'city == "{city}"').TAVG for city in ('LA', 'NYC')
    }
    la_tavg, nyc_tavg = temps['LA'], temps['NYC']

    fig, ax = plt.subplots(figsize=(8, 3))
    for city, series in temps.items():
        ax.plot(series.index, series, label=city)

    ax.set(
        title='Average Daily Temperatures',
        xlim=(la_tavg.index.min(), None),
        ylabel=r'temperature ($^\circ$F)',
    )
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))

    # shade only the stretches where the NYC value exceeds the LA value
    nyc_is_hotter = nyc_tavg > la_tavg
    ax.fill_between(
        la_tavg.index, la_tavg, nyc_tavg, where=nyc_is_hotter,
        hatch='///', facecolor='gray', alpha=0.5, label='NYC hotter than LA'
    )
    ax.legend(ncols=3, loc='lower center')
    return despine(ax)


solution(weather)

<Axes: title={'center': 'Average Daily Temperatures'}, ylabel='temperature ($^\\circ$F)'>


subset = stackoverflow_daily.sum().nlargest(4)
top_libraries_monthly = stackoverflow_monthly.reindex(columns=subset.index)
top_libraries_monthly.head()


fig, ax = plt.subplots(figsize=(12, 3))
ax.stackplot(
    mdates.date2num(top_libraries_monthly.index),
    top_libraries_monthly.to_numpy().T, # each element is a library's time series
    labels=top_libraries_monthly.columns
)

[<matplotlib.collections.PolyCollection at 0x158049670>,
 <matplotlib.collections.PolyCollection at 0x1592644a0>,
 <matplotlib.collections.PolyCollection at 0x1592d9a30>,
 <matplotlib.collections.PolyCollection at 0x1592d9be0>]


fig, ax = plt.subplots(figsize=(12, 3))
ax.stackplot(
    mdates.date2num(top_libraries_monthly.index), top_libraries_monthly.to_numpy().T, 
    labels=top_libraries_monthly.columns
)
ax.set(xlabel='', ylabel='tagged questions', title='Stack Overflow Questions per Month')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.yaxis.set_major_formatter(ticker.EngFormatter())
despine(ax)

<Axes: title={'center': 'Stack Overflow Questions per Month'}, ylabel='tagged questions'>


def area_plot(data):
    """Draw a stacked area plot with one layer per column of `data`.

    The index is converted to Matplotlib date numbers for the x-axis, and the
    layers are labeled with the column names. Returns the Axes.
    """
    fig, ax = plt.subplots(figsize=(12, 3))

    x_values = mdates.date2num(data.index)
    layers = data.to_numpy().T  # one row per column of `data`
    ax.stackplot(x_values, layers, labels=data.columns)

    ax.set(
        xlabel='',
        ylabel='tagged questions',
        title='Stack Overflow Questions per Month',
    )
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.yaxis.set_major_formatter(ticker.EngFormatter())

    despine(ax)
    return ax


def annotate(ax, data):
    """Label each collection on `ax` at the last date in `data`.

    Every label shows the collection's label and the median of that column
    over 2021, placed at the vertical midpoint of the layer (the values of the
    previously visited layers are accumulated as the base). The 'seaborn'
    label is offset from its anchor and connected with an arrow. Returns the
    same Axes.
    """
    final_date = data.index.max()
    stacked_height = 0  # sum of the final values of the layers handled so far
    for area in ax.collections:
        library = area.get_label()
        final_value = data.loc[final_date, library]
        label_text = f' {library}: {data.loc["2021", library].median():,.0f}'

        extras = {}
        if library == 'seaborn':
            extras['xytext'] = (
                final_date + pd.Timedelta(days=20),
                (final_value + stacked_height) * 1.1,
            )
            extras['arrowprops'] = {'arrowstyle': '->'}

        ax.annotate(
            label_text,
            xy=(final_date, final_value / 2 + stacked_height),
            ha='left', va='center', **extras
        )
        stacked_height += final_value
    return ax


ax = area_plot(top_libraries_monthly)
annotate(ax, top_libraries_monthly)

<Axes: title={'center': 'Stack Overflow Questions per Month'}, ylabel='tagged questions'>


def area_plot(data):
    """Draw an annotated stacked area plot with one layer per column of `data`.

    Same figure as the plain area plot, followed by a call to
    `annotate(ax, data)` to label the layers. Returns the Axes.
    """
    fig, ax = plt.subplots(figsize=(12, 3))

    x_values = mdates.date2num(data.index)
    layers = data.to_numpy().T  # one row per column of `data`
    ax.stackplot(x_values, layers, labels=data.columns)

    ax.set(
        xlabel='',
        ylabel='tagged questions',
        title='Stack Overflow Questions per Month',
    )
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.yaxis.set_major_formatter(ticker.EngFormatter())

    despine(ax)
    annotate(ax, data)
    return ax


import datetime as dt

ax = area_plot(top_libraries_monthly)
    
# mark when seaborn was created
seaborn_released = dt.date(2013, 10, 28)
ax.axvline(seaborn_released, ymax=0.6, color='gray', linestyle='dashed')
ax.annotate('seaborn v0.1', xy=(seaborn_released, 4750), rotation=-90, va='top')

Text(2013-10-28, 4750, 'seaborn v0.1')


ax = area_plot(top_libraries_monthly)

seaborn_released = dt.date(2013, 10, 28)
ax.axvline(seaborn_released, ymax=0.6, color='gray', linestyle='dashed')
ax.annotate('seaborn v0.1', xy=(seaborn_released, 4750), rotation=-90, va='top')

# oldest question tagged "seaborn"
first_seaborn_qs = top_libraries_monthly.query('seaborn >= 1')\
    .index[0].to_pydatetime().date()
ax.axvline(first_seaborn_qs, ymax=0.6, color='gray', linestyle='dashed')

<matplotlib.lines.Line2D at 0x137d83c20>


def add_reflines(ax, data):
    """Add two dashed vertical reference lines for seaborn to `ax`.

    One line (with a rotated 'seaborn v0.1' label) is drawn at 2013-10-28;
    the other at the date of the first row of `data` whose `seaborn` value is
    at least 1. Returns the same Axes.
    """
    line_style = dict(ymax=0.6, color='gray', linestyle='dashed')

    seaborn_released = dt.date(2013, 10, 28)
    ax.axvline(seaborn_released, **line_style)
    ax.annotate(
        'seaborn v0.1', xy=(seaborn_released, 4750), rotation=-90, va='top'
    )

    rows_with_questions = data.query('seaborn >= 1')
    first_seaborn_qs = rows_with_questions.index[0].to_pydatetime().date()
    ax.axvline(first_seaborn_qs, **line_style)
    return ax


# draw the annotated area plot and add the two seaborn reference lines
ax = area_plot(top_libraries_monthly)
add_reflines(ax, top_libraries_monthly)

# NOTE(review): `first_seaborn_qs` and `seaborn_released` are locals inside
# add_reflines(); the names used below are the module-level variables assigned
# by earlier cells, so this cell depends on those cells having been run.

# shade the region of posts that were retroactively tagged "seaborn"
ax.axvspan(
    ymax=0.6, xmin=mdates.date2num(first_seaborn_qs),
    xmax=mdates.date2num(seaborn_released), color='gray', alpha=0.25
)
# midpoint between the two dates, used to center the label horizontally
middle = (seaborn_released - first_seaborn_qs) / 2 + first_seaborn_qs
ax.annotate(
    'posts retroactively\ntagged "seaborn"', 
    xy=(mdates.date2num(middle), 3500), 
    va='top', ha='center'
)

Text(15505.0, 3500, 'posts retroactively\ntagged "seaborn"')


def area_plot(data):
    """Draw an annotated stacked area plot and return both Figure and Axes.

    One layer is drawn per column of `data`, labeled by `annotate(ax, data)`.
    The Figure is returned alongside the Axes so callers can add more Axes
    to it.
    """
    fig, ax = plt.subplots(figsize=(12, 3))

    x_values = mdates.date2num(data.index)
    layers = data.to_numpy().T  # one row per column of `data`
    ax.stackplot(x_values, layers, labels=data.columns)

    ax.set(
        xlabel='',
        ylabel='tagged questions',
        title='Stack Overflow Questions per Month',
    )
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.yaxis.set_major_formatter(ticker.EngFormatter())

    despine(ax)
    annotate(ax, data)
    return fig, ax


fig, ax = area_plot(top_libraries_monthly)
inset_ax = fig.add_axes([0.2, 0.6, 0.2, 0.2])


fig, ax = area_plot(top_libraries_monthly)
inset_ax = fig.add_axes([0.2, 0.6, 0.2, 0.2])
colors = {area.get_label(): area.get_facecolor() for area in ax.collections}

# populate the inset with a bar plot of total questions
total_qs = top_libraries_monthly.sum()
inset_ax.barh(
    total_qs.index, total_qs.to_numpy(),
    color=[colors[label] for label in total_qs.index]
)
inset_ax.yaxis.set_inverted(True) # sort bars in descending order


fig, ax = area_plot(top_libraries_monthly)
inset_ax = fig.add_axes([0.2, 0.6, 0.2, 0.2])
colors = {area.get_label(): area.get_facecolor() for area in ax.collections}

total_qs = top_libraries_monthly.sum()
inset_ax.barh(
    total_qs.index, total_qs.to_numpy(), 
    color=[colors[label] for label in total_qs.index]
)
inset_ax.yaxis.set_inverted(True)
despine(inset_ax)
inset_ax.xaxis.set_major_formatter(ticker.EngFormatter())
inset_ax.set_xlabel('total questions')

Text(0.5, 0, 'total questions')


questions_per_library = pd.read_csv(
    '../data/stackoverflow.zip', parse_dates=True, index_col='creation_date'
).loc[:,'pandas':'bokeh'].sum().sort_values()
questions_per_library

yellowbrick        54
geoviews           63
hvplot             92
holoviews         542
vega              598
altair            804
geopandas        1652
bokeh            4386
seaborn          7414
matplotlib      60554
numpy           93797
pandas         214919
dtype: int64


fig, ax = plt.subplots(figsize=(7, 4))
ax.barh(questions_per_library.index, questions_per_library.to_numpy())

<BarContainer object of 12 artists>

ax.set_xscale('log')
ax.set(xscale='log')


fig, ax = plt.subplots(figsize=(7, 4))
ax.barh(questions_per_library.index, questions_per_library.to_numpy())
ax.set(xlabel='total questions', xscale='log')
despine(ax)

<Axes: xlabel='total questions'>


def annotate_bars(ax):
    """Write each horizontal bar's width inside the bar, at its right end.

    The label is the bar width with thousands separators and no decimals,
    right-aligned against the end of the bar and vertically centered, in
    white. Returns the same Axes.
    """
    for bar in ax.patches:
        x, y = bar.get_xy()
        width = bar.get_width()
        # ',.0f' instead of ',d': same text for integer widths, but it does
        # not raise when the bar widths are floats
        ax.text(
            x + width, y + bar.get_height() / 2, f'{width:,.0f} ',
            va='center', ha='right', color='white'
        )
    return ax


fig, ax = plt.subplots(figsize=(7, 4))
ax.barh(questions_per_library.index, questions_per_library.to_numpy())
ax.set(xlabel='total questions', xscale='log')
despine(ax)
annotate_bars(ax)

<Axes: xlabel='total questions'>


import pandas as pd

weather = pd.read_csv('../data/weather.csv', parse_dates=True, index_col='date')
monthly_seattle_prcp = weather.query('city == "Seattle"').resample('1MS').PRCP.sum()
monthly_seattle_prcp.head()

date
2020-01-01    9.17
2020-02-01    4.05
2020-03-01    3.17
2020-04-01    1.70
2020-05-01    3.11
Freq: MS, Name: PRCP, dtype: float64


import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from utils import despine


def solution(data):
    """Draw a bar chart of `data` with each bar's height printed above it.

    `data` supplies the x positions through its index and the bar heights
    through its values. X ticks are placed by a `MonthLocator` and labeled as
    abbreviated month over year. Returns whatever `despine(ax)` returns.
    """
    fig, ax = plt.subplots(figsize=(7, 3))
    ax.bar(data.index, data.to_numpy(), width=20)

    for bar in ax.patches:
        left, bottom = bar.get_xy()
        height = bar.get_height()
        # centered horizontally on the bar, sitting just on top of it
        ax.text(
            left + bar.get_width() / 2, bottom + height, f'{height:.1f}',
            va='bottom', ha='center', color='black'
        )

    ax.set(
        title='Seattle Monthly Precipitation',
        ylabel='total precipitation (inches)',
    )
    x_axis = ax.xaxis
    x_axis.set_major_locator(mdates.MonthLocator())
    x_axis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))
    return despine(ax)


solution(monthly_seattle_prcp)

<Axes: title={'center': 'Seattle Monthly Precipitation'}, ylabel='total precipitation (inches)'>


co_occur = pd.read_csv(
    '../data/stackoverflow_tag_co_occurrences.csv',
    index_col='library'
)
co_occur.style.background_gradient(axis=None, vmin=0, vmax=1)


co_occur.sum(axis=1)

library
hvplot        0.880435
geoviews      0.809524
seaborn       0.717696
holoviews     0.625461
geopandas     0.399516
matplotlib    0.285233
numpy         0.225199
bokeh         0.170999
pandas        0.123107
dtype: float64


def stacked_bars(data):
    """Draw horizontal stacked bars: one bar per row, one segment per column.

    The index labels of `data` are used both as the bar positions and as the
    column keys, so `data` must have a column for every index label. Returns
    whatever `despine(ax)` returns.
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    rows = data.index

    left_edge = 0  # where the next segment starts on each bar
    for name in rows:
        segment = data[name]
        ax.barh(rows, segment, left=left_edge, label=name)
        left_edge = left_edge + segment

    ax.yaxis.set_inverted(True)
    return despine(ax)


ax = stacked_bars(co_occur)


ax = stacked_bars(co_occur)
ax.legend(bbox_to_anchor=(1.35, 0.5), loc='center right', framealpha=0.5)

<matplotlib.legend.Legend at 0x1371247a0>


ax = stacked_bars(co_occur)
ax.legend(bbox_to_anchor=(1.35, 0.5), loc='center right', framealpha=0.5)
ax.set(xlabel='percentage of questions with co-occurrences', xlim=(0, 1))
ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))


def stacked_bars(data):
    """Draw labeled horizontal stacked bars with a legend outside the plot.

    One bar per row of `data`, one segment per column (columns are looked up
    by the index labels). The x-axis spans 0 to 1 and is formatted as a
    percentage. Returns whatever `despine(ax)` returns.
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    rows = data.index

    left_edge = 0  # where the next segment starts on each bar
    for name in rows:
        segment = data[name]
        ax.barh(rows, segment, left=left_edge, label=name)
        left_edge = left_edge + segment

    ax.yaxis.set_inverted(True)
    ax.legend(
        bbox_to_anchor=(1.35, 0.5), loc='center right', framealpha=0.5
    )
    ax.set(
        xlabel='percentage of questions with co-occurrences',
        xlim=(0, 1),
    )
    ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
    return despine(ax)


ax = stacked_bars(co_occur)

for patch in ax.patches:
    width = patch.get_width()
    if width > .09:
        ax.text(
            patch.get_x() + width/2, patch.get_y() + patch.get_height()/2,
            f'{width:.1%}', va='center', ha='center', color='ivory', fontsize=11
        )


def annotate_bars(ax, threshold):
    """Label every patch on `ax` whose width is greater than `threshold`.

    The label is the width formatted as a percentage with one decimal,
    centered on the patch. Returns the same Axes.
    """
    for patch in ax.patches:
        width = patch.get_width()
        if not width > threshold:
            continue  # segment too narrow to label
        center_x = patch.get_x() + width/2
        center_y = patch.get_y() + patch.get_height()/2
        ax.text(
            center_x, center_y, f'{width:.1%}',
            va='center', ha='center', color='ivory', fontsize=11
        )
    return ax


import matplotlib as mpl

cmap = mpl.colormaps['tab10'].reversed()


[cmap(i) for i in range(10)]

[(0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0),
 (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0),
 (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0),
 (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0),
 (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0),
 (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0),
 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0),
 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0),
 (1.0, 0.4980392156862745, 0.054901960784313725, 1.0),
 (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0)]


def stacked_bars(data, cmap):
    """Draw horizontal stacked bars, coloring segment i with `cmap(i)`.

    One bar per row of `data`, one segment per column (columns are looked up
    by the index labels). The legend sits outside the plot and the x-axis
    spans 0 to 1, formatted as a percentage. Returns whatever `despine(ax)`
    returns.
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    rows = data.index

    left_edge = 0  # where the next segment starts on each bar
    for position, name in enumerate(rows):
        segment = data[name]
        ax.barh(
            rows, segment,
            left=left_edge, label=name, color=cmap(position)
        )
        left_edge = left_edge + segment

    ax.yaxis.set_inverted(True)
    ax.legend(
        bbox_to_anchor=(1.35, 0.5), loc='center right', framealpha=0.5
    )
    ax.set(
        xlabel='percentage of questions with co-occurrences',
        xlim=(0, 1),
    )
    ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
    return despine(ax)


ax = stacked_bars(co_occur, cmap)
annotate_bars(ax, threshold=0.09)

<Axes: xlabel='percentage of questions with co-occurrences'>


import pandas as pd

weather = pd.read_csv('../data/weather.csv', parse_dates=True, index_col='date')
quarterly_prcp = weather.pivot(columns='city', values='PRCP').resample('QE').sum()
quarterly_prcp


import matplotlib.pyplot as plt
from utils import despine


def solution(data):
    """Draw stacked horizontal bars of `data`, one segment per index entry.

    Columns are ordered by their column totals (ascending) and each row of
    `data` becomes a segment labeled 'Q<quarter>'. A dashed vertical line
    marks the 'Seattle' total. Returns whatever `despine(ax)` returns.
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    total_prcp = data.sum().sort_values()
    cities = total_prcp.index

    left_edge = 0  # running total already drawn for each column
    for quarter_end in data.index:
        quarter_prcp = data.loc[quarter_end, cities]
        ax.barh(
            cities, quarter_prcp,
            left=left_edge, label=f'Q{quarter_end.quarter}', alpha=0.8
        )
        left_edge = left_edge + quarter_prcp

    ax.set_xlabel('2020 total precipitation (inches)')
    ax.set_title('Total Precipitation per City in 2020', y=1.1)
    ax.axvline(total_prcp['Seattle'], linestyle='--', color='gray')
    ax.legend(
        bbox_to_anchor=(0.5, 1.15), loc='upper center',
        ncols=4, frameon=False
    )
    return despine(ax)


solution(quarterly_prcp)

<Axes: title={'center': 'Total Precipitation per City in 2020'}, xlabel='2020 total precipitation (inches)'>


subway = pd.read_csv(
    '../data/NYC_subway_daily.csv', parse_dates=['Datetime'], 
    index_col=['Borough', 'Datetime']
)
subway_daily = subway.unstack(0)
subway_daily.head()


fig, ax = plt.subplots(figsize=(6, 3))
ax.hist(subway_daily.loc['2018', 'Entries']['M'], ec='black')

(array([  4.,  35.,  40.,  34.,   4.,   3.,   7.,  32.,  98., 108.]),
 array([ 857859. , 1087318.1, 1316777.2, 1546236.3, 1775695.4, 2005154.5,
        2234613.6, 2464072.7, 2693531.8, 2922990.9, 3152450. ]),
 <BarContainer object of 10 artists>)


fig, ax = plt.subplots(figsize=(6, 3))
ax.hist(subway_daily.loc['2018', 'Entries']['M'], ec='black')
ax.set(
    xlabel='Entries', ylabel='Frequency',
    title='Histogram of Daily Subway Entries in Manhattan'
)
ax.xaxis.set_major_formatter(ticker.EngFormatter())
despine(ax)

<Axes: title={'center': 'Histogram of Daily Subway Entries in Manhattan'}, xlabel='Entries', ylabel='Frequency'>


weekday_mask = subway_daily.index.weekday < 5
weekday_mask

array([False, False,  True, ...,  True,  True,  True])


fig, axes = plt.subplots(1, 2, figsize=(8, 3), sharex=True, sharey=False)

for ax, mask in zip(axes, [~weekday_mask, weekday_mask]):
    ax.hist(subway_daily[mask].loc['2018', 'Entries']['M'], ec='black')


fig, axes = plt.subplots(1, 2, figsize=(8, 3), sharex=True, sharey=False)

for ax, mask, label in zip(axes, [~weekday_mask, weekday_mask], ['Weekend', 'Weekday']):
    ax.hist(subway_daily[mask].loc['2018', 'Entries']['M'], ec='black')
    ax.set_xlabel(f'Entries per {label}')
    ax.xaxis.set_major_formatter(ticker.EngFormatter())
    despine(ax)
axes[0].set_ylabel('Frequency')

Text(0, 0.5, 'Frequency')


fig, axes = plt.subplots(1, 2, figsize=(8, 3), sharex=True, sharey=False)

for ax, mask, label in zip(axes, [~weekday_mask, weekday_mask], ['Weekend', 'Weekday']):
    ax.hist(subway_daily[mask].loc['2018', 'Entries']['M'], ec='black')
    ax.set_xlabel(f'Entries per {label}')
    ax.xaxis.set_major_formatter(ticker.EngFormatter())
    despine(ax)
axes[0].set_ylabel('Frequency')
fig.suptitle('Histogram of Daily Subway Entries in Manhattan')

Text(0.5, 0.98, 'Histogram of Daily Subway Entries in Manhattan')


def subway_histogram(subway_daily, weekday_mask):
    """Draw side-by-side histograms of the 2018 `Entries` / `M` values.

    The left panel uses the rows where `weekday_mask` is False ('Weekend'),
    the right panel the rows where it is True ('Weekday'). Returns the Figure
    and the array of Axes.
    """
    fig, axes = plt.subplots(1, 2, figsize=(8, 3), sharex=True, sharey=False)
    masks_by_label = {'Weekend': ~weekday_mask, 'Weekday': weekday_mask}

    for ax, (label, mask) in zip(axes, masks_by_label.items()):
        entries = subway_daily[mask].loc['2018', 'Entries']['M']
        ax.hist(entries, ec='black')
        ax.set_xlabel(f'Entries per {label}')
        ax.xaxis.set_major_formatter(ticker.EngFormatter())

    axes[0].set_ylabel('Frequency')
    fig.suptitle('Histogram of Daily Subway Entries in Manhattan')

    return fig, axes


with plt.style.context('seaborn-v0_8-darkgrid'):
    subway_histogram(subway_daily, weekday_mask)


# show the contents of the custom style sheet; the encoding is given
# explicitly so the read does not depend on the platform's locale default
with open('../style_tweaks.mplstyle', 'r', encoding='utf-8') as style_sheet:
    print(style_sheet.read())

xtick.major.size: 3
ytick.major.size: 3


with plt.style.context(['seaborn-v0_8-darkgrid', '../style_tweaks.mplstyle']):
    subway_histogram(subway_daily, weekday_mask)


fig, axes = plt.subplots(1, 2, figsize=(6, 2.5), sharey=True)
for ax, mask, label in zip(axes, [~weekday_mask, weekday_mask], ['Weekend', 'Weekday']):
    ax.boxplot(subway_daily[mask].loc['2018', 'Entries']['M'])
    ax.set_xlabel(label)
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    despine(ax)
axes[0].set_ylabel('daily subway entries')
fig.suptitle('Box Plot of Daily Subway Entries in Manhattan')

Text(0.5, 0.98, 'Box Plot of Daily Subway Entries in Manhattan')


fig, axes = plt.subplots(1, 2, figsize=(6, 2.5), sharey=True)
for ax, mask, label in zip(axes, [~weekday_mask, weekday_mask], ['Weekend', 'Weekday']):
    ax.boxplot(subway_daily[mask].loc['2018', 'Entries']['M'])
    ax.set_xticklabels([label]) # label the ticks instead of the axis this time
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    despine(ax)
axes[0].set_ylabel('daily subway entries')
fig.suptitle('Box Plot of Daily Subway Entries in Manhattan')

Text(0.5, 0.98, 'Box Plot of Daily Subway Entries in Manhattan')


fig, axes = plt.subplots(1, 2, figsize=(6, 2.5), sharey=True)
for ax, mask, label in zip(axes, [~weekday_mask, weekday_mask], ['Weekend', 'Weekday']):
    ax.boxplot(subway_daily[mask].loc['2018', 'Entries']['M'])
    ax.set_xticklabels([label])
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    despine(ax)
axes[0].set_ylabel('daily subway entries')
fig.suptitle('Box Plot of Daily Subway Entries in Manhattan')
fig.tight_layout()


fig, axes = plt.subplots(1, 2, figsize=(6, 2.5), sharey=True, layout='constrained')
for ax, mask, label in zip(axes, [~weekday_mask, weekday_mask], ['Weekend', 'Weekday']):
    ax.boxplot(subway_daily[mask].loc['2018', 'Entries']['M'])
    ax.set_xticklabels([label])
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    despine(ax)
axes[0].set_ylabel('daily subway entries')
fig.suptitle('Box Plot of Daily Subway Entries in Manhattan')

Text(0.5, 0.98, 'Box Plot of Daily Subway Entries in Manhattan')


import pandas as pd

weather = pd.read_csv('../data/weather.csv', parse_dates=True, index_col='date')
weather.head()


from itertools import zip_longest
import matplotlib.pyplot as plt
from utils import despine


def solution(data):
    """Draw one `AWND` histogram per unique `city` value on a 2x6 grid.

    Grid cells that have no city to show are removed from the figure.
    Returns the Figure and the array of Axes.
    """
    fig, axes = plt.subplots(
        2, 6, figsize=(8, 3), sharex=True, sharey=True, layout='constrained'
    )
    cities = data.city.unique()
    for city, ax in zip_longest(cities, axes.flatten()):
        if not city:
            # more grid cells than cities: drop the unused Axes
            ax.remove()
            continue
        ax.hist(data.query(f'city == "{city}"').AWND, ec='black')
        ax.set_title(city)
        despine(ax)

    fig.supxlabel('daily average wind (MPH)')
    fig.supylabel('frequency')
    fig.suptitle('Daily Average Wind per City in 2020')
    return fig, axes


_ = solution(weather)


import pandas as pd

# Running month-end totals for the columns 'pandas' through 'bokeh',
# reindexed onto a fixed month-end range with missing months filled with 0
questions_per_library = (
    pd.read_csv(
        '../data/stackoverflow.zip',
        index_col='creation_date',
        parse_dates=True,
    )
    .loc[:, 'pandas':'bokeh']
    .resample('1ME')
    .sum()
    .cumsum()
    .reindex(pd.date_range('2008-08', '2021-10', freq='ME'))
    .fillna(0)
)
questions_per_library.tail()


from matplotlib.animation import FuncAnimation


import matplotlib.pyplot as plt
from matplotlib import ticker
from utils import despine


def bar_plot(data):
    """Create the starting frame of the bar chart: one zero-width bar per column.

    Bars are ordered by the values in the last row of `data` (ascending) and
    each bar is labeled with its column name. The x-axis is fixed at
    0 to 250,000. Returns the Figure and the Axes.
    """
    fig, ax = plt.subplots(figsize=(6, 4), layout='constrained')

    final_totals = data.loc[data.index.max()].squeeze()
    sort_order = final_totals.sort_values().index
    ax.barh(sort_order, [0] * len(data.columns), label=sort_order)

    ax.set_xlabel('total questions', fontweight='bold')
    ax.set_xlim(0, 250_000)
    ax.xaxis.set_major_formatter(ticker.EngFormatter())
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_tick_params(labelsize=11)
    despine(ax)

    return fig, ax


import matplotlib_inline
from utils import mpl_svg_config

matplotlib_inline.backend_inline.set_matplotlib_formats(
    'svg', **mpl_svg_config('section-2')
)
bar_plot(questions_per_library)

(<Figure size 600x400 with 1 Axes>, <Axes: xlabel='total questions'>)


def generate_plot_text(ax):
    """Create the empty text elements that the animation fills in later.

    Returns a list with one empty annotation per patch on `ax` (anchored at
    x=0, vertically centered on the patch) and an empty text object positioned
    in Axes coordinates at (0.9, 0.1).
    """
    annotations = []
    for bar in ax.patches:
        mid_y = bar.get_y() + bar.get_height() / 2
        annotations.append(
            ax.annotate('', xy=(0, mid_y), ha='left', va='center')
        )

    time_text = ax.text(
        0.9, 0.1, '',
        transform=ax.transAxes, fontsize=15, ha='center', va='center'
    )
    return annotations, time_text


def update(frame, *, ax, df, annotations, time_text):
    """Redraw the bars and their labels for the row of `df` at `frame`.

    Each patch on `ax` is matched to a column through its label. Patches
    whose value is falsy (e.g. 0) are left untouched; the others get their
    width, annotation position, and annotation text updated. The time text is
    always set to `frame` formatted as abbreviated month over year.
    """
    row = df.loc[frame, :]

    for bar, label in zip(ax.patches, annotations):
        total = row[bar.get_label()]
        if not total:
            continue  # nothing to draw for this column yet
        bar.set_width(total)
        label.set_x(total)
        label.set_text(f' {total:,.0f}')

    time_text.set_text(frame.strftime('%b\n%Y'))


from functools import partial

def bar_plot_init(questions_per_library):
    """Build the starting bar plot and the per-frame update callable.

    Returns the Figure and a `functools.partial` of `update` with the Axes,
    the data, and the text elements already bound, so that it only needs the
    frame argument.
    """
    fig, ax = bar_plot(questions_per_library)
    bar_labels, date_label = generate_plot_text(ax)

    bound_arguments = dict(
        ax=ax,
        df=questions_per_library,
        annotations=bar_labels,
        time_text=date_label,
    )
    return fig, partial(update, **bound_arguments)


# build the starting figure and the function that redraws it for each frame
fig, update_func = bar_plot_init(questions_per_library)

# one frame per index entry of the data; the animation is set not to repeat
ani = FuncAnimation(
    fig, update_func, frames=questions_per_library.index, repeat=False
)
# write the animation to an MP4 file with the ffmpeg writer
ani.save(
    '../media/stackoverflow_questions.mp4', 
    writer='ffmpeg', fps=10, bitrate=100, dpi=300
)
plt.close()  # close the current figure now that the video has been saved


from IPython import display

display.Video(
    '../media/stackoverflow_questions.mp4', width=600, height=400,
    embed=True, html_attributes='controls muted autoplay'
)


subway = pd.read_csv(
    '../data/NYC_subway_daily.csv', parse_dates=['Datetime'], 
    index_col=['Borough', 'Datetime']
)
subway_daily = subway.unstack(0)
subway_daily.head()


manhattan_entries = subway_daily['Entries']['M']


import numpy as np

count_per_bin, bin_ranges = np.histogram(manhattan_entries, bins=30)


def subway_histogram(data, bins, date_range):
    """Draw weekend and weekday histograms of `data` over `date_range`.

    Bin edges are computed once from all of `data`, so both panels share the
    same bins. Each config dict gains a 'hist' entry holding the third item
    returned by `ax.hist()`. Returns the Figure, the Axes array, the bin
    edges, and the list of config dicts.
    """
    bin_ranges = np.histogram(data, bins=bins)[1]

    weekday_mask = data.index.weekday < 5
    configs = [
        dict(label='Weekend', mask=~weekday_mask, ymax=60),
        dict(label='Weekday', mask=weekday_mask, ymax=120),
    ]

    fig, axes = plt.subplots(
        1, 2, figsize=(6, 3), sharex=True, layout='constrained'
    )
    for ax, config in zip(axes, configs):
        panel_data = data[config['mask']].loc[date_range]
        config['hist'] = ax.hist(panel_data, bin_ranges, ec='black')[2]
        ax.xaxis.set_major_formatter(ticker.EngFormatter())
        ax.set(
            xlim=(0, None),
            ylim=(0, config['ymax']),
            xlabel=f'{config["label"]} Entries',
        )
        despine(ax)

    axes[0].set_ylabel('Frequency')
    fig.suptitle('Histogram of Daily Subway Entries in Manhattan')

    return fig, axes, bin_ranges, configs


_ = subway_histogram(manhattan_entries, bins=30, date_range='2017')


def add_time_text(ax):
    """Add an empty text object at (0.15, 0.9) in Axes coordinates.

    Returns the object that `ax.text()` creates, so that its text can be set
    later.
    """
    text_options = dict(
        transform=ax.transAxes, fontsize=12, ha='center', va='center'
    )
    return ax.text(0.15, 0.9, '', **text_options)


def update(frame, *, data, configs, time_text, bin_ranges):
    """Update the histogram bars to cover the 365 days ending at `frame`.

    Parameters
    ----------
    frame
        Timestamp of the current frame (must support `strftime` and
        subtraction of a `pd.Timedelta`).
    data
        Values indexed by date.
    configs
        Dicts with a boolean 'mask' over `data` and a 'hist' entry whose
        `.patches` are the bars to resize.
    time_text
        Text object showing the current month and year.
    bin_ranges
        Bin edges passed to `np.histogram`.

    Returns
    -------
    list
        Only the artists that were actually changed on this call.
    """
    artists = []

    time = frame.strftime('%b\n%Y')
    if time != time_text.get_text():
        time_text.set_text(time)
        artists.append(time_text)

    # the trailing 365-day window is the same for every config, so it is
    # built once here instead of once per loop iteration
    window_start = frame - pd.Timedelta(days=365)
    time_frame_mask = (data.index > window_start) & (data.index <= frame)

    for config in configs:
        counts, _ = np.histogram(
            data[time_frame_mask & config['mask']],
            bin_ranges
        )
        for count, rect in zip(counts, config['hist'].patches):
            # only touch (and report) bars whose height actually changes
            if count != rect.get_height():
                rect.set_height(count)
                artists.append(rect)

    return artists


def histogram_init(data, bins, initial_date_range):
    """
    Draw the starting histograms and prepare the animation's update function.

    Returns the Figure along with `update()` with its keyword-only arguments bound,
    leaving only the frame to be supplied on each call.
    """
    fig, axes, bin_ranges, configs = subway_histogram(data, bins, initial_date_range)

    bound_kwargs = dict(
        data=data,
        configs=configs,
        time_text=add_time_text(axes[0]),  # the label goes on the first subplot
        bin_ranges=bin_ranges,
    )
    return fig, partial(update, **bound_kwargs)


# draw the starting histograms and get the update function bound to them
fig, update_func = histogram_init(
    manhattan_entries, bins=30, initial_date_range=slice('2017', '2019-07')
)

# one frame per index entry from August 2019 through 2021; the animation plays once
ani = FuncAnimation(
    fig, update_func, frames=manhattan_entries['2019-08':'2021'].index,
    repeat=False, blit=True
)
# write the animation to an MP4 file using the ffmpeg writer
ani.save(
    '../media/subway_entries_subplots.mp4',
    writer='ffmpeg', fps=30, bitrate=500, dpi=300
)
# close the current figure now that the video has been written
plt.close()


from IPython import display

# embed the saved video in the output
display.Video(
    '../media/subway_entries_subplots.mp4', width=600, height=300,
    embed=True, html_attributes='controls muted autoplay'
)


import pandas as pd

# read the CSV with a (Borough, Datetime) index, move the boroughs into the columns,
# and keep the 'M' column of the 'Entries' column group
manhattan_entries = pd.read_csv(
    '../data/NYC_subway_daily.csv', parse_dates=['Datetime'], 
    index_col=['Borough', 'Datetime']
).unstack(0)['Entries']['M']
manhattan_entries.head()

Datetime
2017-02-04    1390496.0
2017-02-05    1232537.0
2017-02-06    2774016.0
2017-02-07    2892462.0
2017-02-08    2998897.0
Name: M, dtype: float64


from functools import partial

from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
from matplotlib import ticker
import numpy as np

from utils import despine


def subway_histogram(data, bins, date_range):
    """
    Draw overlapping weekend and weekday histograms of `data` on a single Axes.

    The bin edges are computed from all of `data`, while only the rows selected by
    `date_range` are drawn. Returns the Figure, the Axes, the bin edges, and the list
    of per-group configs; each config gains a 'hist' entry holding the third return
    value of `ax.hist()` for its group.
    """
    bin_ranges = np.histogram(data, bins=bins)[1]

    is_weekday = data.index.weekday < 5
    configs = [ # CHANGE: add bar color to config
        dict(label='Weekend', mask=~is_weekday, color='green'),
        dict(label='Weekday', mask=is_weekday, color='blue'),
    ]

    fig, ax = plt.subplots(figsize=(6, 3), layout='constrained') # CHANGE: single Axes
    for group in configs:
        subset = data[group['mask']].loc[date_range]
        # CHANGES: color the bars and add transparency
        group['hist'] = ax.hist(
            subset, bin_ranges, ec='black',
            facecolor=group['color'], alpha=0.5, label=group['label']
        )[2]

    ax.xaxis.set_major_formatter(ticker.EngFormatter())
    despine(ax)

    # CHANGES: update formatting and add legend
    ax.set(
        xlim=(0, None), ylim=(0, 120), xlabel='Entries', ylabel='Frequency',
        title='Histogram of Daily Subway Entries in Manhattan'
    )
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1), ncols=2, frameon=False)

    return fig, ax, bin_ranges, configs


def add_time_text(ax):
    """
    Add an initially empty text label to `ax` and return it.

    The label is centered on (0.075, 0.9), expressed through `ax.transAxes`.
    """
    text_options = dict(
        transform=ax.transAxes, fontsize=12, ha='center', va='center'
    )
    return ax.text(0.075, 0.9, '', **text_options)


def update(frame, *, data, configs, time_text, bin_ranges):
    """
    Update the histogram bars and the time label for one animation frame.

    Parameters:
        frame: The date for this frame (must support `strftime()` and subtraction
            of a `pd.Timedelta`).
        data: The series being binned; its index is compared against `frame`.
        configs: Dictionaries that each provide a boolean 'mask' over `data` and a
            'hist' object whose `patches` are the bars to resize.
        time_text: The text artist showing the current month and year.
        bin_ranges: The bin edges passed to `np.histogram()`.

    Returns:
        A list of only the artists that were modified for this frame.
    """
    artists = []

    time = frame.strftime('%b\n%Y')
    if time != time_text.get_text():
        time_text.set_text(time)
        artists.append(time_text)

    # the window covering the 365 days up to and including `frame` is the same for
    # every config, so it is computed once instead of once per loop iteration
    window_start = frame - pd.Timedelta(days=365)
    time_frame_mask = (data.index > window_start) & (data.index <= frame)

    for config in configs:
        counts, _ = np.histogram(
            data[time_frame_mask & config['mask']],
            bin_ranges
        )
        for count, rect in zip(counts, config['hist'].patches):
            # only touch (and report) the bars whose height actually changes
            if count != rect.get_height():
                rect.set_height(count)
                artists.append(rect)

    return artists


def histogram_init(data, bins, initial_date_range):
    """
    Draw the starting histogram and prepare the animation's update function.

    Returns the Figure along with `update()` with its keyword-only arguments bound,
    leaving only the frame to be supplied on each call.
    """
    # CHANGE: rename variable `ax`
    fig, ax, bin_ranges, configs = subway_histogram(data, bins, initial_date_range)

    bound_kwargs = dict(
        data=data,
        configs=configs,
        time_text=add_time_text(ax), # CHANGE: pass in `ax`
        bin_ranges=bin_ranges,
    )
    return fig, partial(update, **bound_kwargs)


# draw the starting histogram and get the update function bound to it
fig, update_func = histogram_init(
    manhattan_entries, bins=30, initial_date_range=slice('2017', '2019-07')
)

# one frame per index entry from August 2019 through 2021; the animation plays once
ani = FuncAnimation(
    fig, update_func, frames=manhattan_entries['2019-08':'2021'].index,
    repeat=False, blit=True
)
# write the animation to an MP4 file using the ffmpeg writer
ani.save(
    '../media/subway_entries_exercise.mp4', # CHANGE: new filename
    writer='ffmpeg', fps=30, bitrate=500, dpi=300
)
# close the current figure now that the video has been written
plt.close()


from IPython import display

# embed the saved video in the output
display.Video(
    '../media/subway_entries_exercise.mp4', width=600, height=300,
    embed=True, html_attributes='controls muted autoplay'
)


import geopandas as gpd

# load the earthquakes, convert `time` from epoch milliseconds to datetimes, derive
# the month number from the converted times, and keep only the columns used below
earthquakes = gpd.read_file('../data/earthquakes.geojson').assign(
    time=lambda x: pd.to_datetime(x.time, unit='ms'),
    month=lambda x: x.time.dt.month
)[['geometry', 'mag', 'time', 'month']]

# (rows, columns)
earthquakes.shape

(188527, 4)


# preview the first five rows
earthquakes.head()


import warnings

import geoviews as gv
import geoviews.feature as gf
import holoviews as hv

# ignore any FutureWarning raised while loading the Matplotlib plotting extension
with warnings.catch_warnings(action='ignore', category=FutureWarning):
    gv.extension('matplotlib')


import calendar

def plot_earthquakes(data, month_num):
    """
    Build a map overlay of the earthquakes in `data` for the month `month_num`.

    The points are colored by magnitude and layered over the land, coastline, and
    border features; the overlay is titled with the month's name.
    """
    monthly = data.query(f'month == {month_num}')
    points = gv.Points(
        monthly,
        kdims=['longitude', 'latitude'], # key dimensions (for coordinates in this case)
        vdims=['mag'] # value dimensions (for modifying the plot in this case)
    )
    # fix the ranges so every month shares the same magnitude and latitude limits
    points = points.redim.range(mag=(-2, 10), latitude=(-90, 90))

    # create an overlay by combining Cartopy features and the points with *
    basemap = gf.land * gf.coastline * gf.borders
    overlay = basemap * points

    point_opts = gv.opts.Points(
        color='mag', cmap='fire_r', colorbar=True, alpha=0.75
    )
    overlay_opts = gv.opts.Overlay(
        global_extent=False, title=calendar.month_name[month_num], fontscale=2
    )
    return overlay.opts(point_opts, overlay_opts)


# preview a single month (1 = January) with the figure sizing used for the animation
plot_earthquakes(earthquakes, 1).opts(
    fig_inches=(6, 3), aspect=2, fig_size=250, fig_bounds=(0.07, 0.05, 0.87, 0.95)
)


# one overlay per month number (1 through 12), keyed by the month number
frames = {
    month_num: plot_earthquakes(earthquakes, month_num)
    for month_num in range(1, 13)
}
holomap = hv.HoloMap(frames)


# render the HoloMap as a GIF at 5 frames per second
hv.output(
    holomap.opts(
        fig_inches=(6, 3), aspect=2, fig_size=250,
        fig_bounds=(0.07, 0.05, 0.87, 0.95)
    ), holomap='gif', fps=5
)

# save the HoloMap to earthquakes.gif with the same options
hv.save(
    holomap.opts(
        fig_inches=(6, 3), aspect=2, fig_size=250,
        fig_bounds=(0.07, 0.05, 0.87, 0.95)
    ), 'earthquakes.gif', fps=5
)


import geopandas as gpd
import pandas as pd

# load the earthquakes, convert `time` from epoch milliseconds to datetimes, derive
# the month number from the converted times, and keep only the columns used below
earthquakes = gpd.read_file('../data/earthquakes.geojson').assign(
    time=lambda x: pd.to_datetime(x.time, unit='ms'),
    month=lambda x: x.time.dt.month
)[['geometry', 'mag', 'time', 'month']]

earthquakes.head()


import geoviews as gv
import geoviews.feature as gf
import holoviews as hv

# load the Matplotlib plotting extension
gv.extension('matplotlib')


def plot_earthquakes(data, date):
    """
    Build a map overlay of the earthquakes in `data` that occurred on `date`.

    `date` is interpolated into the filter as a string and formatted with date
    format codes for the title, so it needs to be a date-like object.
    """
    # CHANGE: filter `data` by `date`
    daily = data.query(f'time.dt.strftime("%Y-%m-%d") == "{date}"')
    points = gv.Points(
        daily,
        kdims=['longitude', 'latitude'],
        vdims=['mag']
    )
    points = points.redim.range(mag=(-2, 10), latitude=(-90, 90))

    basemap = gf.land * gf.coastline * gf.borders
    overlay = basemap * points

    point_opts = gv.opts.Points(
        color='mag', cmap='fire_r', colorbar=True, alpha=0.75
    )
    # CHANGE: title each frame with the date
    overlay_opts = gv.opts.Overlay(
        global_extent=False, title=f'{date:%B %d, %Y}', fontscale=2
    )
    return overlay.opts(point_opts, overlay_opts)


import datetime as dt

# one overlay per day of April 2020 (April has 30 days, hence range(1, 31)),
# keyed by the day of the month
frames = {
    day: plot_earthquakes(earthquakes, dt.date(2020, 4, day))
    for day in range(1, 31)
}
holomap = hv.HoloMap(frames)


# render the HoloMap as a GIF at 5 frames per second
hv.output(
    holomap.opts(
        fig_inches=(6, 3), aspect=2, fig_size=250,
        fig_bounds=(0.07, 0.05, 0.87, 0.95)
    ), holomap='gif', fps=5
)


import geopandas as gpd
import pandas as pd

# load the earthquakes (all columns this time), convert `time` from epoch milliseconds
# to datetimes, derive the month number, and drop rows with missing values
earthquakes = gpd.read_file('../data/earthquakes.geojson').assign(
    time=lambda x: pd.to_datetime(x.time, unit='ms'),
    month=lambda x: x.time.dt.month
).dropna()

earthquakes.head()


import warnings

from cartopy import crs
import geoviews as gv
import geoviews.feature as gf

# ignore any FutureWarning raised while loading the Bokeh plotting extension
with warnings.catch_warnings(action='ignore', category=FutureWarning):
    gv.extension('bokeh')


# coordinates are the key dimensions; the value dimensions carry the month used for
# grouping, the magnitude used for coloring, and additional descriptive columns
points = gv.Points(
    earthquakes,
    kdims=['longitude', 'latitude'],
    vdims=['month', 'place', 'tsunami', 'mag', 'magType']
)

# set colorbar limits for magnitude and axis limits
points = points.redim.range(
    mag=(-2, 10), longitude=(-180, 180), latitude=(-90, 90)
)


# layer the points, grouped by month, on top of the map features
overlay = gf.land * gf.coastline * gf.borders * points.groupby('month')


# configure the projection of the features, the plot size, and the point styling
interactive_map = overlay.opts(
    gv.opts.Feature(projection=crs.PlateCarree()),
    gv.opts.Overlay(width=700, height=450),
    gv.opts.Points(color='mag', cmap='fire_r', colorbar=True, tools=['hover'])
)


import panel as pn

# wrap the map in a Panel object with the widget placed below the plot
earthquake_viz = pn.panel(interactive_map, widget_location='bottom')


# keep January only, pull the coordinates out of the geometry into plain columns,
# and drop the columns that are no longer needed
january_earthquakes = earthquakes.query('month == 1').assign(
    longitude=lambda x: x.geometry.x,
    latitude=lambda x: x.geometry.y
).drop(columns=['month', 'geometry'])


# imported for its side effect -- presumably registering the `hvplot` accessor used below
import hvplot.pandas


# geographic points plot colored by magnitude, drawn over map tiles
geo = january_earthquakes.hvplot(
    x='longitude', y='latitude', kind='points',
    color='mag', cmap='fire_r', clim=(-2, 10),
    tiles='CartoLight', geo=True, global_extent=True,
    xlabel='Longitude', ylabel='Latitude', title='January 2020 Earthquakes',
    frame_height=450
)


# table of the same data, sorted by the coordinates
table = january_earthquakes.sort_values(['longitude', 'latitude']).hvplot(
    kind='table', width=650, height=450, title='Raw Data'
)


# combine the map and the table into a single layout with +
layout = geo + table


import holoviews as hv

# temporary bug fix for this example from
# https://github.com/holoviz/holoviews/issues/4645#issuecomment-1907032005
from geoviews.element.geo import WMTS
WMTS._transforms = []

# create a link_selections instance and apply it to the map + table layout
selection = hv.link_selections.instance()
map_and_table = selection(layout)


# filter the data through the selection object, then take the 3 rows with the
# largest magnitudes
selection.filter(january_earthquakes).nlargest(3, 'mag')


import geopandas as gpd
import pandas as pd

# load the earthquakes, convert `time` from epoch milliseconds to datetimes, derive
# the month number, and drop rows with missing values
earthquakes = gpd.read_file('../data/earthquakes.geojson').assign(
    time=lambda x: pd.to_datetime(x.time, unit='ms'),
    month=lambda x: x.time.dt.month
).dropna()

earthquakes.head()


import hvplot.pandas
import panel as pn

# histogram of magnitude grouped by magnitude type; the grouping widget is placed on
# the left, and the Panel object is embedded in the output
pn.panel(earthquakes[['mag', 'magType']].hvplot(
    kind='hist', x='mag', groupby='magType', ylabel='frequency',
    frame_height=200, responsive=True, widget_location='left'
)).embed()


import numpy as np

# Read only the needed columns, lowercase the column names, spell out the region
# codes, and add a direction-independent `route` label: the two airport codes are
# joined in alphabetical order so A->B and B->A share the same route.
flight_stats = pd.read_csv(
    '../data/T100_MARKET_ALL_CARRIER.zip',
    usecols=[
        'CLASS', 'REGION', 'UNIQUE_CARRIER_NAME', 'ORIGIN_CITY_NAME', 'ORIGIN', 
        'DEST_CITY_NAME', 'DEST', 'PASSENGERS', 'FREIGHT', 'MAIL'
    ]
).rename(lambda x: x.lower(), axis=1).assign(
    region=lambda x: x.region.replace({
        'D': 'Domestic', 'I': 'International', 'A': 'Atlantic', 
        'L': 'Latin America', 'P': 'Pacific', 'S': 'System'
    }),
    route=lambda x: np.where(
        x.origin < x.dest,
        x.origin + '-' + x.dest,
        x.dest + '-' + x.origin
    )
)


flight_stats.head()


# the ten cities used for the chord diagram (its title calls them the top 10 cities)
cities = [
    'Atlanta, GA', 'Chicago, IL', 'New York, NY', 'Los Angeles, CA',
    'Dallas/Fort Worth, TX', 'Denver, CO', 'Houston, TX', 
    'San Francisco, CA', 'Seattle, WA', 'Orlando, FL'
]

# airlines kept by name later on; all others are grouped as 'Other Airlines'
top_airlines = [
    'American Airlines Inc.', 'Delta Air Lines Inc.', 'JetBlue Airways',
    'Southwest Airlines Co.', 'United Air Lines Inc.'
]


# Keep class "F" rows between two different cities in `cities`, total the passengers,
# freight, and mail per airport pair, and drop the pairs without any passengers.
total_flight_stats = flight_stats.query(
    '`class` == "F" and origin_city_name != dest_city_name'
    f' and origin_city_name.isin({cities}) and dest_city_name.isin({cities})'
).groupby([
    'origin', 'origin_city_name', 'dest', 'dest_city_name'
])[['passengers', 'freight', 'mail']].sum().reset_index().query('passengers > 0')


# fixed random_state so the same 10 rows are sampled on every run
total_flight_stats.sample(10, random_state=1)


# origin and destination airports are the key dimensions; the remaining columns are
# value dimensions, which the tooltips below reference
chord = hv.Chord(
    total_flight_stats,
    kdims=['origin', 'dest'], 
    vdims=['passengers', 'origin_city_name', 'dest_city_name', 'mail', 'freight']
)


from bokeh.models import HoverTool

# tooltip label -> template; @column references a column and {...} applies a format
tooltips = {
    'Source': '@origin_city_name (@origin)',
    'Target': '@dest_city_name (@dest)',
    'Passengers': '@passengers{0,.}',
    'Mail': '@mail{0,.} lbs.',
    'Freight': '@freight{0,.} lbs.',
}
hover = HoverTool(tooltips=tooltips)


chord = chord.opts(
    labels='index', node_color='index', cmap='Category20', # node config
    edge_color='origin', edge_cmap='Category20', directed=True, # edge config
    inspection_policy='edges', tools=[hover, 'tap'], # tooltip config
    frame_width=500, aspect=1, # plot size config
    title='Total Passenger Service Travel Between Top 10 Cities in 2019'
)


# the first five entries of `cities`
top_cities = cities[:5]

# Keep domestic class "F" rows between two different cities in `top_cities`, then
# total the passengers per carrier, route, and direction of travel.
domestic_passenger_travel = flight_stats.query(
    'region == "Domestic" and `class` == "F" and origin_city_name != dest_city_name '
    f'and origin_city_name.isin({top_cities}) and dest_city_name.isin({top_cities})'
).groupby([
    'region', 'unique_carrier_name', 'route', 'origin_city_name', 'dest_city_name'
]).passengers.sum().reset_index()

domestic_passenger_travel.head()


# Group every carrier that isn't one of the top airlines under 'Other Airlines'.
# An exact membership test is used instead of a regular expression assembled from
# the airline names: the names contain '.', which is a wildcard in a regex, and a
# negative lookahead would also keep any longer name that merely starts with one
# of the top airline names.
domestic_passenger_travel.unique_carrier_name = (
    domestic_passenger_travel.unique_carrier_name.where(
        domestic_passenger_travel.unique_carrier_name.isin(top_airlines),
        'Other Airlines'
    )
)


# each carrier's share of the total passengers (the shares sum to 1)
domestic_passenger_travel.groupby('unique_carrier_name').passengers.sum().div(
    domestic_passenger_travel.passengers.sum()
)

unique_carrier_name
American Airlines Inc.    0.337186
Delta Air Lines Inc.      0.312187
JetBlue Airways           0.049500
Other Airlines            0.120544
Southwest Airlines Co.    0.079074
United Air Lines Inc.     0.101509
Name: passengers, dtype: float64


def get_edges(data, *, source_col, target_col):
    """
    Total the passengers for each (source, target) pair in `data`.

    The grouping columns are renamed to 'source' and 'target', and only the pairs
    with more than zero passengers are returned.
    """
    totals = data.groupby([source_col, target_col])['passengers'].sum().reset_index()
    edges = totals.rename(columns={source_col: 'source', target_col: 'target'})
    return edges[edges['passengers'] > 0]


# first level of edges: region -> carrier; the region value 'Domestic' is relabeled
# as 'Top Routes' (the pattern is anchored, so only that exact value is replaced)
carrier_edges = get_edges(
    domestic_passenger_travel, 
    source_col='region',
    target_col='unique_carrier_name'
).replace('^Domestic$', 'Top Routes', regex=True)

carrier_edges


# second level of edges: carrier -> route
carrier_to_route_edges = get_edges(
    domestic_passenger_travel,
    source_col='unique_carrier_name',
    target_col='route'
)

carrier_to_route_edges.sample(10, random_state=1)


# stack both levels of edges and express the passengers in millions
all_edges = pd.concat([carrier_edges, carrier_to_route_edges]).assign(
    passengers=lambda x: x.passengers / 1e6
)


# the unit 'M' matches the division by 1e6 above
sankey = hv.Sankey(
    all_edges, 
    kdims=['source', 'target'],
    vdims=hv.Dimension('passengers', unit='M')
).opts(
    labels='index', label_position='right', cmap='Set1', # node config 
    edge_color='lightgray', # edge config
    width=750, height=600, # plot size config
    title='Travel Between the Top 5 Cities in 2019'
)


import geopandas as gpd
import pandas as pd

# load the earthquakes, convert `time` from epoch milliseconds to datetimes, derive
# the month number, and drop rows with missing values
earthquakes = gpd.read_file('../data/earthquakes.geojson').assign(
    time=lambda x: pd.to_datetime(x.time, unit='ms'),
    month=lambda x: x.time.dt.month
).dropna()

earthquakes.head()


from bokeh.models import HoverTool, DatetimeTickFormatter
import hvplot.pandas
import panel as pn


# custom tooltips: the date is formatted as a datetime; `@0` presumably refers to the
# unnamed count column produced by size() below -- verify against the rendered plot
hover = HoverTool(
    tooltips=dict(date='@time{%b %d}', earthquakes='@0{0,.}'),
    formatters={'@time': 'datetime'}
)
# count the earthquakes per day and plot the counts as a line
line_plot = earthquakes.resample('1D', on='time').size().hvplot(
    title='Earthquakes per Day in 2020', ylabel='total earthquakes',
    tools=[hover], responsive=True, frame_height=200,
    xformatter=DatetimeTickFormatter(months='%B')
)
pn.panel(line_plot)

	pandas	matplotlib	numpy	seaborn	geopandas	geoviews	altair	yellowbrick	vega	holoviews	hvplot	bokeh
creation_date
2021-09-08	136.933333	26.966667	37.133333	5.766667	1.833333	0.000000	0.500000	0.133333	0.500000	0.400000	0.033333	1.033333
2021-09-09	138.000000	27.033333	37.933333	5.766667	1.833333	0.000000	0.533333	0.133333	0.566667	0.400000	0.000000	1.033333
2021-09-10	137.100000	26.733333	37.966667	5.800000	1.833333	0.000000	0.533333	0.133333	0.566667	0.366667	0.000000	1.066667
2021-09-11	133.433333	26.400000	37.233333	5.666667	1.833333	0.000000	0.533333	0.133333	0.533333	0.333333	0.000000	1.000000
2021-09-12	130.466667	25.933333	36.666667	5.666667	1.733333	0.033333	0.533333	0.133333	0.533333	0.233333	0.000000	0.866667

	hvplot	geoviews	seaborn	holoviews	geopandas	matplotlib	numpy	bokeh	pandas
library
hvplot	0.000000	0.032609	0.021739	0.539855	0.048913	0.023551	0.000000	0.110507	0.103261
geoviews	0.047619	0.000000	0.000000	0.431217	0.058201	0.039683	0.007937	0.140212	0.084656
seaborn	0.000270	0.000000	0.000000	0.000135	0.000607	0.468033	0.015871	0.002001	0.230780
holoviews	0.091636	0.050123	0.001845	0.000000	0.003075	0.030750	0.010148	0.371771	0.066113
geopandas	0.002724	0.002220	0.002724	0.001009	0.000000	0.118644	0.009988	0.011703	0.250504
matplotlib	0.000036	0.000041	0.057304	0.000275	0.003237	0.000000	0.087198	0.001594	0.135548
numpy	0.000000	0.000005	0.001254	0.000059	0.000176	0.056294	0.000000	0.000144	0.167267
bokeh	0.002318	0.002014	0.003382	0.045942	0.004408	0.022002	0.003078	0.000000	0.087855
pandas	0.000044	0.000025	0.007961	0.000167	0.001926	0.038191	0.073000	0.001793	0.000000

city	Atlanta	Boston	Chicago	Honolulu	Houston	LA	Miami	NYC	Phoenix	SF	Seattle
date
2020-03-31	26.04	8.29	7.03	5.75	7.70	4.49	5.03	8.25	3.47	2.35	16.39
2020-06-30	11.48	8.96	16.96	3.20	14.08	2.80	26.71	6.50	0.00	1.61	7.07
2020-09-30	14.69	5.13	5.60	0.99	13.00	0.00	24.56	13.81	0.10	0.05	2.82
2020-12-31	12.56	13.94	7.27	3.51	7.25	1.75	22.73	11.50	0.44	1.56	11.74

	Entries				Exits
Borough	Bk	Bx	M	Q	Bk	Bx	M	Q
Datetime
2017-02-04	617650.0	247539.0	1390496.0	408736.0	417449.0	148237.0	1225689.0	279699.0
2017-02-05	542667.0	199078.0	1232537.0	339716.0	405607.0	139856.0	1033610.0	268626.0
2017-02-06	1184916.0	472846.0	2774016.0	787206.0	761166.0	267991.0	2240027.0	537780.0
2017-02-07	1192638.0	470573.0	2892462.0	790557.0	763653.0	270007.0	2325024.0	544828.0
2017-02-08	1243658.0	497412.0	2998897.0	825679.0	788356.0	275695.0	2389534.0	559639.0

	pandas	matplotlib	numpy	seaborn	geopandas	geoviews	altair	yellowbrick	vega	holoviews	hvplot	bokeh
2021-05-31	200734.0	57853.0	89812.0	6855.0	1456.0	57.0	716.0	46.0	532.0	513.0	84.0	4270.0
2021-06-30	205065.0	58602.0	91026.0	7021.0	1522.0	57.0	760.0	48.0	557.0	521.0	88.0	4308.0
2021-07-31	209235.0	59428.0	92254.0	7174.0	1579.0	62.0	781.0	50.0	572.0	528.0	89.0	4341.0
2021-08-31	213410.0	60250.0	93349.0	7344.0	1631.0	62.0	797.0	52.0	589.0	541.0	92.0	4372.0
2021-09-30	214919.0	60554.0	93797.0	7414.0	1652.0	63.0	804.0	54.0	598.0	542.0	92.0	4386.0

	pandas	matplotlib	numpy	seaborn	geopandas	geoviews	altair	yellowbrick	vega	holoviews	hvplot	bokeh
creation_date
2018-06-30	2690	612	931	75	12	0	9	0	10	9	0	82
2014-12-31	417	280	420	17	1	0	0	0	0	0	0	20
2012-12-31	124	159	209	0	0	0	0	0	0	0	0	0
2011-04-30	2	58	101	0	0	0	0	0	0	0	0	0
2011-08-31	0	74	124	0	0	0	0	0	0	0	0	0

	city	AWND	PRCP	SNOW	TAVG	TMAX	TMIN
date
2020-01-01	Atlanta	7.2	0.0	0.0	45.0	57.0	36.0
2020-01-01	Boston	15.4	0.0	0.0	39.0	43.0	36.0
2020-01-01	Chicago	11.9	0.0	0.0	28.0	42.0	21.0
2020-01-01	Honolulu	6.3	0.0	NaN	76.0	81.0	68.0
2020-01-01	Houston	6.5	0.1	0.0	52.0	60.0	47.0

	pandas	numpy	matplotlib	seaborn
creation_date
2008-09-30	0	3	2	0
2008-10-31	0	2	0	0
2008-11-30	0	3	0	0
2008-12-31	0	4	2	0
2009-01-31	0	7	1	0

	geometry	mag	time	month
0	POINT Z (-67.12750 19.21750 12.00000)	2.75	2020-01-01 00:01:56.590	1
1	POINT Z (-67.09010 19.07660 6.00000)	2.55	2020-01-01 00:03:38.210	1
2	POINT Z (-66.85410 17.87050 6.00000)	1.81	2020-01-01 00:05:09.440	1
3	POINT Z (-66.86360 17.89930 8.00000)	1.84	2020-01-01 00:05:36.930	1
4	POINT Z (-66.86850 17.90660 8.00000)	1.64	2020-01-01 00:09:20.060	1

	mag	place	time	magType	geometry	month
0	2.75	80 km N of Isabela, Puerto Rico	2020-01-01 00:01:56.590	md	POINT Z (-67.12750 19.21750 12.00000)	1
1	2.55	64 km N of Isabela, Puerto Rico	2020-01-01 00:03:38.210	md	POINT Z (-67.09010 19.07660 6.00000)	1
2	1.81	12 km SSE of Maria Antonia, Puerto Rico	2020-01-01 00:05:09.440	md	POINT Z (-66.85410 17.87050 6.00000)	1
3	1.84	9 km SSE of Maria Antonia, Puerto Rico	2020-01-01 00:05:36.930	md	POINT Z (-66.86360 17.89930 8.00000)	1
4	1.64	8 km SSE of Maria Antonia, Puerto Rico	2020-01-01 00:09:20.060	md	POINT Z (-66.86850 17.90660 8.00000)	1

	mag	place	time	tsunami	magType	longitude	latitude
16362	5.1	270 km SE of Chiniak, Alaska	2020-01-31 11:25:37.262	1	mww	-149.3295	55.7981
911	5.0	217 km SSE of Old Harbor, Alaska	2020-01-02 08:54:33.083	1	mww	-151.4274	55.5493
7831	4.3	258 km SE of Chiniak, Alaska	2020-01-13 09:00:21.044	0	mb	-149.3261	55.9471

	freight	unique_carrier_name	region	origin	origin_city_name	dest	dest_city_name	class	route
0	53185.0	Emirates	International	DXB	Dubai, United Arab Emirates	IAH	Houston, TX	G	DXB-IAH
1	9002.0	Emirates	International	DXB	Dubai, United Arab Emirates	JFK	New York, NY	G	DXB-JFK
2	2220750.0	Emirates	International	DXB	Dubai, United Arab Emirates	ORD	Chicago, IL	G	DXB-ORD
3	1201490.0	Emirates	International	IAH	Houston, TX	DXB	Dubai, United Arab Emirates	G	DXB-IAH
4	248642.0	Emirates	International	JFK	New York, NY	DXB	Dubai, United Arab Emirates	G	DXB-JFK

	origin	origin_city_name	dest	dest_city_name	passengers	freight	mail
78	LGA	New York, NY	DEN	Denver, CO	589190.0	506023.0	293108.0
117	ORD	Chicago, IL	SEA	Seattle, WA	810594.0	1063463.0	2627325.0
31	DFW	Dallas/Fort Worth, TX	MCO	Orlando, FL	683700.0	187672.0	95570.0
5	ATL	Atlanta, GA	LAX	Los Angeles, CA	1121378.0	8707125.0	3267077.0
126	SEA	Seattle, WA	LGA	New York, NY	24.0	0.0	0.0
45	IAH	Houston, TX	ATL	Atlanta, GA	566369.0	367543.0	726670.0
14	DEN	Denver, CO	HOU	Houston, TX	305193.0	363119.0	0.0
44	HOU	Houston, TX	SFO	San Francisco, CA	1843.0	5523.0	0.0
73	LAX	Los Angeles, CA	MDW	Chicago, IL	277226.0	2022416.0	0.0
89	MCO	Orlando, FL	DEN	Denver, CO	594878.0	368516.0	138811.0

	region	unique_carrier_name	route	origin_city_name	dest_city_name	passengers
0	Domestic	Air Wisconsin Airlines Corp	ATL-ORD	Atlanta, GA	Chicago, IL	915.0
1	Domestic	Air Wisconsin Airlines Corp	ATL-ORD	Chicago, IL	Atlanta, GA	556.0
2	Domestic	Alaska Airlines Inc.	JFK-LAX	Los Angeles, CA	New York, NY	265307.0
3	Domestic	Alaska Airlines Inc.	JFK-LAX	New York, NY	Los Angeles, CA	257685.0
4	Domestic	Alaska Airlines Inc.	LAX-ORD	Chicago, IL	Los Angeles, CA	48269.0

	source	target	passengers
0	Top Routes	American Airlines Inc.	9426060.0
1	Top Routes	Delta Air Lines Inc.	8727210.0
2	Top Routes	JetBlue Airways	1383776.0
3	Top Routes	Other Airlines	3369815.0
4	Top Routes	Southwest Airlines Co.	2210533.0
5	Top Routes	United Air Lines Inc.	2837682.0

	source	target	passengers
39	Other Airlines	DFW-LGA	157366.0
41	Other Airlines	JFK-LAX	523222.0
2	American Airlines Inc.	ATL-LAX	294304.0
48	Southwest Airlines Co.	ATL-MDW	498481.0
50	Southwest Airlines Co.	LAX-MDW	558574.0
44	Other Airlines	LAX-ORD	378552.0
33	Other Airlines	ATL-LAX	146882.0
35	Other Airlines	ATL-MDW	1201.0
40	Other Airlines	DFW-ORD	241147.0
27	JetBlue Airways	DFW-JFK	140.0

Beyond the Basics: Data Visualization in Python¶

Stefanie Molin¶

Bio¶

Prerequisites¶

Session Outline¶

Let's get started¶

Section 1: Getting Started With Matplotlib¶

Learning Path¶

Why start with Matplotlib?¶

Flexibility¶

Customization¶

Extensibility¶

Learning Path¶

Matplotlib basics¶

Functional (implicit) approach¶

Object-oriented (explicit) approach¶

Learning Path¶

Plotting with Matplotlib¶

Line plot¶

1. Create the line plot.¶

2. Add a shaded region for $\pm$2 standard deviations from the mean.¶

3. Set the axis labels, y-axis limits, plot title, and despine the plot.¶

Scatter plot¶

1. Create the scatter plot.¶

2. Convert to Matplotlib dates.¶

3. Add the best fit lines.¶

4. Label the axes, add a legend, and despine.¶

5. Format both the x- and y-axis tick labels.¶

Exercise 1.1¶

Using the data in weather.csv, plot the daily average temperature (TAVG) for both LA and NYC. Fill in all sections where NYC's daily average temperature was higher than LA's in 2020.¶

Solution¶

Area plot¶

1. Create the area plot.¶

2. Label and format the axes, provide a title, and despine the plot.¶

3. Add annotations.¶

More Annotations¶

Bar plot¶

1. Add the inset Axes object to the Figure object.¶

2. Create the horizontal bar plot.¶

3. Label and format the plot.¶

Annotating bars¶

1. Create the bar plot.¶

2. Label, format, and apply a log scale to the plot.¶

3. Annotate each of the bars.¶

Exercise 1.2¶

Using the data in weather.csv, make a vertical bar plot showing total monthly precipitation (PRCP) in Seattle. Annotate the bars.¶

Solution¶

Stacked bar plot¶

1. Create the stacked bar plot.¶

2. Add the legend.¶

3. Label and format the plot.¶

4. Annotate the bars.¶

5. Change the color scheme.¶

Exercise 1.3¶

Using the data in weather.csv, create a stacked horizontal bar plot of total precipitation per city per quarter (each city will have four segments – one for the total precipitation in each quarter of the year). Add a vertical line at Seattle's total precipitation.¶

Solution¶

Histogram¶

1. Create the histogram.¶

2. Label and format the plot.¶

3. Explore the use of subplots.¶

4. Apply a style sheet.¶

Box plot¶

Exercise 1.4¶

Using the data in weather.csv, generate histograms for the daily average wind (AWND) in each of the cities. Make sure to use subplots that share both the x- and y-axis.¶

Solution¶

Additional resources¶

Section 1 Complete 🎉¶

Section 2: Moving Beyond Static Visualizations¶

Learning Path¶

Animating cumulative values over time¶

1. Create a dataset of cumulative questions per library over time.¶

2. Import the FuncAnimation class.¶

3. Write a function for generating the initial plot.¶

4. Write a function for generating annotations and plot text.¶

5. Define the plot update function.¶

6. Bind arguments to the update function.¶

7. Animate the plot.¶

Learning Path¶

Animating distributions over time¶

1. Create a dataset of daily subway entries.¶

Using the data in `weather.csv`, plot the daily average temperature (`TAVG`) for both LA and NYC. Fill in all sections where NYC's daily average temperature was higher than LA's in 2020.¶

1. Add the inset `Axes` object to the `Figure` object.¶

Using the data in `weather.csv`, make a vertical bar plot showing total monthly precipitation (`PRCP`) in Seattle. Annotate the bars.¶

Using the data in `weather.csv`, create a stacked horizontal bar plot of total precipitation per city per quarter (each city will have four segments – one for the total precipitation in each quarter of the year). Add a vertical line at Seattle's total precipitation.¶

Using the data in `weather.csv`, generate histograms for the daily average wind (`AWND`) in each of the cities. Make sure to use subplots that share both the x- and y-axis.¶

2. Import the `FuncAnimation` class.¶

1. Modify the `subway_histogram()` function to account for bar color and transparency, as well as plotting everything on a single `Axes` object.¶

3. Modify the `histogram_init()` function to account for a single `Axes` object.¶

1. Modify the `plot_earthquakes()` function to filter by date instead of month and use the date for the title.¶

2. Generate frames per day in April, and create a `HoloMap` object.¶

Using hvPlot, make histograms of earthquake magnitude (`mag`) for each magnitude type (`magType`) with a dropdown to select the magnitude type.¶

Use hvPlot to create a line plot of total earthquakes per day. Add custom tooltips using Bokeh's `HoverTool`.¶