We will begin by introducing the Series, DataFrame, and Index classes, which are the basic building blocks of the pandas library, and showing how to work with them. By the end of this section, you will be able to create DataFrames and perform operations on them to inspect and filter the data.
A DataFrame is composed of one or more Series. The names of the Series form the column names, and the row labels form the Index.
import pandas as pd
meteorites = pd.read_csv('../data/Meteorite_Landings.csv', nrows=5)
meteorites
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Aachen | 1 | Valid | L5 | 21 | Fell | 01/01/1880 12:00:00 AM | 50.77500 | 6.08333 | (50.775, 6.08333) |
1 | Aarhus | 2 | Valid | H6 | 720 | Fell | 01/01/1951 12:00:00 AM | 56.18333 | 10.23333 | (56.18333, 10.23333) |
2 | Abee | 6 | Valid | EH4 | 107000 | Fell | 01/01/1952 12:00:00 AM | 54.21667 | -113.00000 | (54.21667, -113.0) |
3 | Acapulco | 10 | Valid | Acapulcoite | 1914 | Fell | 01/01/1976 12:00:00 AM | 16.88333 | -99.90000 | (16.88333, -99.9) |
4 | Achiras | 370 | Valid | L6 | 780 | Fell | 01/01/1902 12:00:00 AM | -33.16667 | -64.95000 | (-33.16667, -64.95) |
Source: NASA's Open Data Portal
meteorites.name
0 Aachen 1 Aarhus 2 Abee 3 Acapulco 4 Achiras Name: name, dtype: object
meteorites.columns
Index(['name', 'id', 'nametype', 'recclass', 'mass (g)', 'fall', 'year', 'reclat', 'reclong', 'GeoLocation'], dtype='object')
meteorites.index
RangeIndex(start=0, stop=5, step=1)
import pandas as pd
meteorites = pd.read_csv('../data/Meteorite_Landings.csv')
Collect the data from NASA's Open Data Portal using the Socrata Open Data API (SODA) with the requests
library:
import requests
response = requests.get(
'https://data.nasa.gov/resource/gh4g-9sfh.json',
params={'$limit': 50_000}
)
if response.ok:
payload = response.json()
else:
print(f'Request was not successful and returned code: {response.status_code}.')
payload = None
Create the DataFrame with the resulting payload:
import pandas as pd
df = pd.DataFrame(payload)
df.head(3)
name | id | nametype | recclass | mass | fall | year | reclat | reclong | geolocation | :@computed_region_cbhk_fwbd | :@computed_region_nnqa_25f4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Aachen | 1 | Valid | L5 | 21 | Fell | 1880-01-01T00:00:00.000 | 50.775000 | 6.083330 | {'latitude': '50.775', 'longitude': '6.08333'} | NaN | NaN |
1 | Aarhus | 2 | Valid | H6 | 720 | Fell | 1951-01-01T00:00:00.000 | 56.183330 | 10.233330 | {'latitude': '56.18333', 'longitude': '10.23333'} | NaN | NaN |
2 | Abee | 6 | Valid | EH4 | 107000 | Fell | 1952-01-01T00:00:00.000 | 54.216670 | -113.000000 | {'latitude': '54.21667', 'longitude': '-113.0'} | NaN | NaN |
Tip: df.to_csv('data.csv')
writes this data to a new file called data.csv
.
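For example, a minimal sketch using the standard index parameter to skip writing the default integer index:
df.to_csv('data.csv', index=False)  # write without the row numbers
pd.read_csv('data.csv', nrows=3)  # read a few rows back to confirm the round trip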
Now that we have some data, we need to perform an initial inspection of it. This gives us information on what the data looks like, how many rows/columns there are, and how much data we have.
Let's inspect the meteorites
data.
meteorites.shape
(45716, 10)
meteorites.columns
Index(['name', 'id', 'nametype', 'recclass', 'mass (g)', 'fall', 'year', 'reclat', 'reclong', 'GeoLocation'], dtype='object')
meteorites.dtypes
name object id int64 nametype object recclass object mass (g) float64 fall object year object reclat float64 reclong float64 GeoLocation object dtype: object
meteorites.head()
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Aachen | 1 | Valid | L5 | 21.0 | Fell | 01/01/1880 12:00:00 AM | 50.77500 | 6.08333 | (50.775, 6.08333) |
1 | Aarhus | 2 | Valid | H6 | 720.0 | Fell | 01/01/1951 12:00:00 AM | 56.18333 | 10.23333 | (56.18333, 10.23333) |
2 | Abee | 6 | Valid | EH4 | 107000.0 | Fell | 01/01/1952 12:00:00 AM | 54.21667 | -113.00000 | (54.21667, -113.0) |
3 | Acapulco | 10 | Valid | Acapulcoite | 1914.0 | Fell | 01/01/1976 12:00:00 AM | 16.88333 | -99.90000 | (16.88333, -99.9) |
4 | Achiras | 370 | Valid | L6 | 780.0 | Fell | 01/01/1902 12:00:00 AM | -33.16667 | -64.95000 | (-33.16667, -64.95) |
Sometimes there may be extraneous data at the end of the file, so checking the bottom few rows is also important:
meteorites.tail()
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | |
---|---|---|---|---|---|---|---|---|---|---|
45711 | Zillah 002 | 31356 | Valid | Eucrite | 172.0 | Found | 01/01/1990 12:00:00 AM | 29.03700 | 17.01850 | (29.037, 17.0185) |
45712 | Zinder | 30409 | Valid | Pallasite, ungrouped | 46.0 | Found | 01/01/1999 12:00:00 AM | 13.78333 | 8.96667 | (13.78333, 8.96667) |
45713 | Zlin | 30410 | Valid | H4 | 3.3 | Found | 01/01/1939 12:00:00 AM | 49.25000 | 17.66667 | (49.25, 17.66667) |
45714 | Zubkovsky | 31357 | Valid | L6 | 2167.0 | Found | 01/01/2003 12:00:00 AM | 49.78917 | 41.50460 | (49.78917, 41.5046) |
45715 | Zulu Queen | 30414 | Valid | L3.7 | 200.0 | Found | 01/01/1976 12:00:00 AM | 33.98333 | -115.68333 | (33.98333, -115.68333) |
meteorites.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45716 entries, 0 to 45715 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 45716 non-null object 1 id 45716 non-null int64 2 nametype 45716 non-null object 3 recclass 45716 non-null object 4 mass (g) 45585 non-null float64 5 fall 45716 non-null object 6 year 45425 non-null object 7 reclat 38401 non-null float64 8 reclong 38401 non-null float64 9 GeoLocation 38401 non-null object dtypes: float64(3), int64(1), object(6) memory usage: 3.5+ MB
Read in the 2019_Yellow_Taxi_Trip_Data.csv file. Examine the first 5 rows:
import pandas as pd
taxis = pd.read_csv('../data/2019_Yellow_Taxi_Trip_Data.csv')
taxis.head()
vendorid | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | ratecodeid | store_and_fwd_flag | pulocationid | dolocationid | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 2019-10-23T16:39:42.000 | 2019-10-23T17:14:10.000 | 1 | 7.93 | 1 | N | 138 | 170 | 1 | 29.5 | 1.0 | 0.5 | 7.98 | 6.12 | 0.3 | 47.90 | 2.5 |
1 | 1 | 2019-10-23T16:32:08.000 | 2019-10-23T16:45:26.000 | 1 | 2.00 | 1 | N | 11 | 26 | 1 | 10.5 | 1.0 | 0.5 | 0.00 | 0.00 | 0.3 | 12.30 | 0.0 |
2 | 2 | 2019-10-23T16:08:44.000 | 2019-10-23T16:21:11.000 | 1 | 1.36 | 1 | N | 163 | 162 | 1 | 9.5 | 1.0 | 0.5 | 2.00 | 0.00 | 0.3 | 15.80 | 2.5 |
3 | 2 | 2019-10-23T16:22:44.000 | 2019-10-23T16:43:26.000 | 1 | 1.00 | 1 | N | 170 | 163 | 1 | 13.0 | 1.0 | 0.5 | 4.32 | 0.00 | 0.3 | 21.62 | 2.5 |
4 | 2 | 2019-10-23T16:45:11.000 | 2019-10-23T16:58:49.000 | 1 | 1.96 | 1 | N | 163 | 236 | 1 | 10.5 | 1.0 | 0.5 | 0.50 | 0.00 | 0.3 | 15.30 | 2.5 |
Source: NYC Open Data collected via SODA.
taxis.shape
(10000, 18)
A crucial part of working with DataFrames is extracting subsets of the data: finding rows that meet a certain set of criteria, isolating columns/rows of interest, etc. After narrowing down our data, we are closer to discovering insights. This section will be the backbone of many analysis tasks.
We can select columns as attributes if their names would be valid Python variables:
meteorites.name
0 Aachen 1 Aarhus 2 Abee 3 Acapulco 4 Achiras ... 45711 Zillah 002 45712 Zinder 45713 Zlin 45714 Zubkovsky 45715 Zulu Queen Name: name, Length: 45716, dtype: object
If they aren't, we have to select them as keys. However, we can select multiple columns at once this way:
meteorites[['name', 'mass (g)']]
name | mass (g) | |
---|---|---|
0 | Aachen | 21.0 |
1 | Aarhus | 720.0 |
2 | Abee | 107000.0 |
3 | Acapulco | 1914.0 |
4 | Achiras | 780.0 |
... | ... | ... |
45711 | Zillah 002 | 172.0 |
45712 | Zinder | 46.0 |
45713 | Zlin | 3.3 |
45714 | Zubkovsky | 2167.0 |
45715 | Zulu Queen | 200.0 |
45716 rows × 2 columns
meteorites[100:104]
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | |
---|---|---|---|---|---|---|---|---|---|---|
100 | Benton | 5026 | Valid | LL6 | 2840.0 | Fell | 01/01/1949 12:00:00 AM | 45.95000 | -67.55000 | (45.95, -67.55) |
101 | Berduc | 48975 | Valid | L6 | 270.0 | Fell | 01/01/2008 12:00:00 AM | -31.91000 | -58.32833 | (-31.91, -58.32833) |
102 | Béréba | 5028 | Valid | Eucrite-mmict | 18000.0 | Fell | 01/01/1924 12:00:00 AM | 11.65000 | -3.65000 | (11.65, -3.65) |
103 | Berlanguillas | 5029 | Valid | L6 | 1440.0 | Fell | 01/01/1811 12:00:00 AM | 41.68333 | -3.80000 | (41.68333, -3.8) |
We use iloc[]
to select rows and columns by their position:
meteorites.iloc[100:104, [0, 3, 4, 6]]
name | recclass | mass (g) | year | |
---|---|---|---|---|
100 | Benton | LL6 | 2840.0 | 01/01/1949 12:00:00 AM |
101 | Berduc | L6 | 270.0 | 01/01/2008 12:00:00 AM |
102 | Béréba | Eucrite-mmict | 18000.0 | 01/01/1924 12:00:00 AM |
103 | Berlanguillas | L6 | 1440.0 | 01/01/1811 12:00:00 AM |
We use loc[]
to select by name:
meteorites.loc[100:104, 'mass (g)':'year']
mass (g) | fall | year | |
---|---|---|---|
100 | 2840.0 | Fell | 01/01/1949 12:00:00 AM |
101 | 270.0 | Fell | 01/01/2008 12:00:00 AM |
102 | 18000.0 | Fell | 01/01/1924 12:00:00 AM |
103 | 1440.0 | Fell | 01/01/1811 12:00:00 AM |
104 | 960.0 | Fell | 01/01/2004 12:00:00 AM |
A Boolean mask is an array-like structure of Boolean values – it's a way to specify which rows/columns we want to select (True) and which we don't (False).
Here's an example of a Boolean mask for meteorites weighing more than 50 grams that were found on Earth (i.e., they were not observed falling):
(meteorites['mass (g)'] > 50) & (meteorites.fall == 'Found')
0 False 1 False 2 False 3 False 4 False ... 45711 True 45712 False 45713 False 45714 True 45715 True Length: 45716, dtype: bool
Important: Take note of the syntax here. We surround each condition with parentheses, and we use bitwise operators (&
, |
, ~
) instead of logical operators (and
, or
, not
).
We can use a Boolean mask to select the subset of meteorites weighing more than 1 million grams (1,000 kilograms or roughly 2,205 pounds) that were observed falling:
meteorites[(meteorites['mass (g)'] > 1e6) & (meteorites.fall == 'Fell')]
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | |
---|---|---|---|---|---|---|---|---|---|---|
29 | Allende | 2278 | Valid | CV3 | 2000000.0 | Fell | 01/01/1969 12:00:00 AM | 26.96667 | -105.31667 | (26.96667, -105.31667) |
419 | Jilin | 12171 | Valid | H5 | 4000000.0 | Fell | 01/01/1976 12:00:00 AM | 44.05000 | 126.16667 | (44.05, 126.16667) |
506 | Kunya-Urgench | 12379 | Valid | H5 | 1100000.0 | Fell | 01/01/1998 12:00:00 AM | 42.25000 | 59.20000 | (42.25, 59.2) |
707 | Norton County | 17922 | Valid | Aubrite | 1100000.0 | Fell | 01/01/1948 12:00:00 AM | 39.68333 | -99.86667 | (39.68333, -99.86667) |
920 | Sikhote-Alin | 23593 | Valid | Iron, IIAB | 23000000.0 | Fell | 01/01/1947 12:00:00 AM | 46.16000 | 134.65333 | (46.16, 134.65333) |
Tip: Boolean masks can be used with loc[]
and iloc[]
.
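For example, here is a quick sketch that combines a mask with loc[] to narrow down the columns at the same time (the heavy_falls variable name is just for this example):
heavy_falls = (meteorites['mass (g)'] > 1e6) & (meteorites.fall == 'Fell')
meteorites.loc[heavy_falls, ['name', 'mass (g)', 'year']]  # rows from the mask, only these columns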
An alternative to this is the query()
method:
meteorites.query("`mass (g)` > 1e6 and fall == 'Fell'")
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | |
---|---|---|---|---|---|---|---|---|---|---|
29 | Allende | 2278 | Valid | CV3 | 2000000.0 | Fell | 01/01/1969 12:00:00 AM | 26.96667 | -105.31667 | (26.96667, -105.31667) |
419 | Jilin | 12171 | Valid | H5 | 4000000.0 | Fell | 01/01/1976 12:00:00 AM | 44.05000 | 126.16667 | (44.05, 126.16667) |
506 | Kunya-Urgench | 12379 | Valid | H5 | 1100000.0 | Fell | 01/01/1998 12:00:00 AM | 42.25000 | 59.20000 | (42.25, 59.2) |
707 | Norton County | 17922 | Valid | Aubrite | 1100000.0 | Fell | 01/01/1948 12:00:00 AM | 39.68333 | -99.86667 | (39.68333, -99.86667) |
920 | Sikhote-Alin | 23593 | Valid | Iron, IIAB | 23000000.0 | Fell | 01/01/1947 12:00:00 AM | 46.16000 | 134.65333 | (46.16, 134.65333) |
Tip: Here, we can use both logical operators and bitwise operators.
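We can also reference Python variables inside query() with the @ prefix, which keeps values like thresholds out of the query string. A small sketch (the threshold variable is just for illustration):
threshold = 1e6
meteorites.query("`mass (g)` > @threshold and fall == 'Fell'")  # same result as above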
In the next section of this workshop, we will discuss data cleaning for a more meaningful analysis of our datasets; however, we can already extract some interesting insights from the meteorites
data by calculating summary statistics.
meteorites.fall.value_counts()
fall Found 44609 Fell 1107 Name: count, dtype: int64
The Meteoritical Society states that "relict meteorites are composed mostly of terrestrial minerals, but are thought to have once been meteorites." What proportion of the data are relict meteorites? Let's verify these were all found versus observed falling:
meteorites.value_counts(subset=['nametype', 'fall'], normalize=True)
nametype fall Valid Found 0.974145 Fell 0.024215 Relict Found 0.001641 Name: proportion, dtype: float64
meteorites['mass (g)'].mean()
np.float64(13278.078548601512)
Important: The mean isn't always the best measure of central tendency. If there are outliers in the distribution, the mean will be skewed. Here, the mean is being pulled higher by some very heavy meteorites – the distribution is right-skewed.
Taking a look at some quantiles at the extremes of the distribution shows that the mean is between the 95th and 99th percentile of the distribution, so it isn't a good measure of central tendency here:
meteorites['mass (g)'].quantile([0.01, 0.05, 0.5, 0.95, 0.99])
0.01 0.44 0.05 1.10 0.50 32.60 0.95 4000.00 0.99 50600.00 Name: mass (g), dtype: float64
A better measure in this case is the median (50th percentile), since it is robust to outliers:
meteorites['mass (g)'].median()
np.float64(32.6)
meteorites['mass (g)'].max()
np.float64(60000000.0)
Let's extract the information on this meteorite:
meteorites.loc[meteorites['mass (g)'].idxmax()]
name Hoba id 11890 nametype Valid recclass Iron, IVB mass (g) 60000000.0 fall Found year 01/01/1920 12:00:00 AM reclat -19.58333 reclong 17.91667 GeoLocation (-19.58333, 17.91667) Name: 16392, dtype: object
Fun fact: This meteorite landed in Namibia and is a tourist attraction.
meteorites.recclass.nunique()
466
Some examples:
meteorites.recclass.unique()[:14]
array(['L5', 'H6', 'EH4', 'Acapulcoite', 'L6', 'LL3-6', 'H5', 'L', 'Diogenite-pm', 'Unknown', 'H4', 'H', 'Iron, IVA', 'CR2-an'], dtype=object)
Note: All fields preceded with "rec" are the values recommended by The Meteoritical Society. Check out this Wikipedia article for some information on meteorite classes.
We can get common summary statistics for all columns at once. By default, this will only be numeric columns, but here, we will summarize everything together:
meteorites.describe(include='all')
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | |
---|---|---|---|---|---|---|---|---|---|---|
count | 45716 | 45716.000000 | 45716 | 45716 | 4.558500e+04 | 45716 | 45425 | 38401.000000 | 38401.000000 | 38401 |
unique | 45716 | NaN | 2 | 466 | NaN | 2 | 266 | NaN | NaN | 17100 |
top | Aachen | NaN | Valid | L6 | NaN | Found | 01/01/2003 12:00:00 AM | NaN | NaN | (0.0, 0.0) |
freq | 1 | NaN | 45641 | 8285 | NaN | 44609 | 3323 | NaN | NaN | 6214 |
mean | NaN | 26889.735104 | NaN | NaN | 1.327808e+04 | NaN | NaN | -39.122580 | 61.074319 | NaN |
std | NaN | 16860.683030 | NaN | NaN | 5.749889e+05 | NaN | NaN | 46.378511 | 80.647298 | NaN |
min | NaN | 1.000000 | NaN | NaN | 0.000000e+00 | NaN | NaN | -87.366670 | -165.433330 | NaN |
25% | NaN | 12688.750000 | NaN | NaN | 7.200000e+00 | NaN | NaN | -76.714240 | 0.000000 | NaN |
50% | NaN | 24261.500000 | NaN | NaN | 3.260000e+01 | NaN | NaN | -71.500000 | 35.666670 | NaN |
75% | NaN | 40656.750000 | NaN | NaN | 2.026000e+02 | NaN | NaN | 0.000000 | 157.166670 | NaN |
max | NaN | 57458.000000 | NaN | NaN | 6.000000e+07 | NaN | NaN | 81.166670 | 354.473330 | NaN |
Important: NaN
values signify missing data. For instance, the fall
column contains strings, so there is no value for mean
; likewise, mass (g)
is numeric, so we don't have entries for the categorical summary statistics (unique
, top
, freq
).
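Relatedly, describe() accepts a percentiles argument if the defaults aren't enough – a quick sketch:
meteorites['mass (g)'].describe(percentiles=[0.01, 0.05, 0.95, 0.99])  # add extreme percentiles to the summary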
Using the 2019_Yellow_Taxi_Trip_Data.csv file, calculate summary statistics for the fare_amount, tip_amount, tolls_amount, and total_amount columns.
Find the fare_amount, tip_amount, tolls_amount, and total_amount for the longest trip by distance (trip_distance).
Using the 2019_Yellow_Taxi_Trip_Data.csv file, calculate summary statistics for the fare_amount, tip_amount, tolls_amount, and total_amount columns:
import pandas as pd
taxis = pd.read_csv('../data/2019_Yellow_Taxi_Trip_Data.csv')
taxis[['fare_amount', 'tip_amount', 'tolls_amount', 'total_amount']].describe()
fare_amount | tip_amount | tolls_amount | total_amount | |
---|---|---|---|---|
count | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 |
mean | 15.106313 | 2.634494 | 0.623447 | 22.564659 |
std | 13.954762 | 3.409800 | 6.437507 | 19.209255 |
min | -52.000000 | 0.000000 | -6.120000 | -65.920000 |
25% | 7.000000 | 0.000000 | 0.000000 | 12.375000 |
50% | 10.000000 | 2.000000 | 0.000000 | 16.300000 |
75% | 16.000000 | 3.250000 | 0.000000 | 22.880000 |
max | 176.000000 | 43.000000 | 612.000000 | 671.800000 |
Find the fare_amount, tip_amount, tolls_amount, and total_amount for the longest trip by distance (trip_distance):
taxis.loc[
taxis.trip_distance.idxmax(),
['fare_amount', 'tip_amount', 'tolls_amount', 'total_amount']
]
fare_amount 176.0 tip_amount 18.29 tolls_amount 6.12 total_amount 201.21 Name: 8338, dtype: object
To prepare our data for analysis, we need to perform data wrangling. In this section, we will learn how to clean and reformat data (e.g., renaming columns and fixing data type mismatches), restructure/reshape it, and enrich it (e.g., discretizing columns, calculating aggregations, and combining data sources).
In this section, we will take a look at creating, renaming, and dropping columns; type conversion; and sorting – all of which make our analysis easier. We will be working with the 2019 Yellow Taxi Trip Data provided by NYC Open Data.
import pandas as pd
taxis = pd.read_csv('../data/2019_Yellow_Taxi_Trip_Data.csv')
taxis.head()
vendorid | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | ratecodeid | store_and_fwd_flag | pulocationid | dolocationid | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 2019-10-23T16:39:42.000 | 2019-10-23T17:14:10.000 | 1 | 7.93 | 1 | N | 138 | 170 | 1 | 29.5 | 1.0 | 0.5 | 7.98 | 6.12 | 0.3 | 47.90 | 2.5 |
1 | 1 | 2019-10-23T16:32:08.000 | 2019-10-23T16:45:26.000 | 1 | 2.00 | 1 | N | 11 | 26 | 1 | 10.5 | 1.0 | 0.5 | 0.00 | 0.00 | 0.3 | 12.30 | 0.0 |
2 | 2 | 2019-10-23T16:08:44.000 | 2019-10-23T16:21:11.000 | 1 | 1.36 | 1 | N | 163 | 162 | 1 | 9.5 | 1.0 | 0.5 | 2.00 | 0.00 | 0.3 | 15.80 | 2.5 |
3 | 2 | 2019-10-23T16:22:44.000 | 2019-10-23T16:43:26.000 | 1 | 1.00 | 1 | N | 170 | 163 | 1 | 13.0 | 1.0 | 0.5 | 4.32 | 0.00 | 0.3 | 21.62 | 2.5 |
4 | 2 | 2019-10-23T16:45:11.000 | 2019-10-23T16:58:49.000 | 1 | 1.96 | 1 | N | 163 | 236 | 1 | 10.5 | 1.0 | 0.5 | 0.50 | 0.00 | 0.3 | 15.30 | 2.5 |
Source: NYC Open Data collected via SODA.
Let's start by dropping the ID columns and the store_and_fwd_flag
column, which we won't be using.
mask = taxis.columns.str.contains('id$|store_and_fwd_flag', regex=True)
columns_to_drop = taxis.columns[mask]
columns_to_drop
Index(['vendorid', 'ratecodeid', 'store_and_fwd_flag', 'pulocationid', 'dolocationid'], dtype='object')
taxis = taxis.drop(columns=columns_to_drop)
taxis.head()
tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-10-23T16:39:42.000 | 2019-10-23T17:14:10.000 | 1 | 7.93 | 1 | 29.5 | 1.0 | 0.5 | 7.98 | 6.12 | 0.3 | 47.90 | 2.5 |
1 | 2019-10-23T16:32:08.000 | 2019-10-23T16:45:26.000 | 1 | 2.00 | 1 | 10.5 | 1.0 | 0.5 | 0.00 | 0.00 | 0.3 | 12.30 | 0.0 |
2 | 2019-10-23T16:08:44.000 | 2019-10-23T16:21:11.000 | 1 | 1.36 | 1 | 9.5 | 1.0 | 0.5 | 2.00 | 0.00 | 0.3 | 15.80 | 2.5 |
3 | 2019-10-23T16:22:44.000 | 2019-10-23T16:43:26.000 | 1 | 1.00 | 1 | 13.0 | 1.0 | 0.5 | 4.32 | 0.00 | 0.3 | 21.62 | 2.5 |
4 | 2019-10-23T16:45:11.000 | 2019-10-23T16:58:49.000 | 1 | 1.96 | 1 | 10.5 | 1.0 | 0.5 | 0.50 | 0.00 | 0.3 | 15.30 | 2.5 |
Tip: Another way to do this is to select the columns we want to keep: taxis.loc[:,~mask]
.
Next, let's rename the datetime columns:
taxis = taxis.rename(
columns={
'tpep_pickup_datetime': 'pickup',
'tpep_dropoff_datetime': 'dropoff'
}
)
taxis.columns
Index(['pickup', 'dropoff', 'passenger_count', 'trip_distance', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge'], dtype='object')
Notice anything off with the data types?
taxis.dtypes
pickup object dropoff object passenger_count int64 trip_distance float64 payment_type int64 fare_amount float64 extra float64 mta_tax float64 tip_amount float64 tolls_amount float64 improvement_surcharge float64 total_amount float64 congestion_surcharge float64 dtype: object
Both pickup
and dropoff
should be stored as datetimes. Let's fix this:
taxis[['pickup', 'dropoff']] = \
taxis[['pickup', 'dropoff']].apply(pd.to_datetime)
taxis.dtypes
pickup datetime64[ns] dropoff datetime64[ns] passenger_count int64 trip_distance float64 payment_type int64 fare_amount float64 extra float64 mta_tax float64 tip_amount float64 tolls_amount float64 improvement_surcharge float64 total_amount float64 congestion_surcharge float64 dtype: object
Tip: There are other ways to perform type conversion. For numeric values, we can use the pd.to_numeric()
function, and we will see the astype()
method, which is a more generic method, a little later.
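As a sketch of the astype() approach on this dataset (the target dtypes below are arbitrary choices for illustration, not conversions we need here):
taxis.astype({'passenger_count': 'int8', 'payment_type': 'category'}).dtypes  # returns a new object; taxis is unchanged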
Let's calculate the following for each row: 1) the elapsed time of the trip, 2) the tip percentage, 3) the taxes, fees, tolls, and surcharges paid, and 4) the average speed of the trip. The numbered comments in the code below mark each of these calculations:
taxis = taxis.assign(
elapsed_time=lambda x: x.dropoff - x.pickup, # 1
cost_before_tip=lambda x: x.total_amount - x.tip_amount,
tip_pct=lambda x: x.tip_amount / x.cost_before_tip, # 2
fees=lambda x: x.cost_before_tip - x.fare_amount, # 3
avg_speed=lambda x: x.trip_distance.div(
x.elapsed_time.dt.total_seconds() / 60 / 60
) # 4
)
Tip: New to lambda
functions? These small, anonymous functions can receive multiple arguments, but can only contain one expression (the return value). You will see these a lot in pandas code. Read more about them here.
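If it helps, the lambda used for cost_before_tip above is equivalent to passing a named function, as in this throwaway sketch:
def cost_before_tip(df):
    # assign() calls this with the DataFrame and uses the return value as the new column
    return df.total_amount - df.tip_amount

taxis.assign(cost_before_tip=cost_before_tip).head(2)  # same result; doesn't modify taxis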
Our new columns get added to the right:
taxis.head(2)
pickup | dropoff | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-10-23 16:39:42 | 2019-10-23 17:14:10 | 1 | 7.93 | 1 | 29.5 | 1.0 | 0.5 | 7.98 | 6.12 | 0.3 | 47.9 | 2.5 | 0 days 00:34:28 | 39.92 | 0.1999 | 10.42 | 13.804642 |
1 | 2019-10-23 16:32:08 | 2019-10-23 16:45:26 | 1 | 2.00 | 1 | 10.5 | 1.0 | 0.5 | 0.00 | 0.00 | 0.3 | 12.3 | 0.0 | 0 days 00:13:18 | 12.30 | 0.0000 | 1.80 | 9.022556 |
Some things to note:
- We used lambda functions to 1) avoid typing taxis repeatedly and 2) be able to access the cost_before_tip and elapsed_time columns in the same method call that creates them.
- To create a single new column, we can also use df['new_col'] = <values>.

We can use the sort_values() method to sort based on any number of columns:
taxis.sort_values(['passenger_count', 'pickup'], ascending=[False, True]).head()
pickup | dropoff | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5997 | 2019-10-23 15:55:19 | 2019-10-23 16:08:25 | 6 | 1.58 | 2 | 10.0 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 14.3 | 2.5 | 0 days 00:13:06 | 14.3 | 0.000000 | 4.3 | 7.236641 |
443 | 2019-10-23 15:56:59 | 2019-10-23 16:04:33 | 6 | 1.46 | 2 | 7.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 11.8 | 2.5 | 0 days 00:07:34 | 11.8 | 0.000000 | 4.3 | 11.577093 |
8722 | 2019-10-23 15:57:33 | 2019-10-23 16:03:34 | 6 | 0.62 | 1 | 5.5 | 1.0 | 0.5 | 0.7 | 0.0 | 0.3 | 10.5 | 2.5 | 0 days 00:06:01 | 9.8 | 0.071429 | 4.3 | 6.182825 |
4198 | 2019-10-23 15:57:38 | 2019-10-23 16:05:07 | 6 | 1.18 | 1 | 7.0 | 1.0 | 0.5 | 1.0 | 0.0 | 0.3 | 12.3 | 2.5 | 0 days 00:07:29 | 11.3 | 0.088496 | 4.3 | 9.461024 |
8238 | 2019-10-23 15:58:31 | 2019-10-23 16:29:29 | 6 | 3.23 | 2 | 19.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 23.8 | 2.5 | 0 days 00:30:58 | 23.8 | 0.000000 | 4.3 | 6.258342 |
To pick out the largest/smallest rows, use nlargest()
/ nsmallest()
instead. Looking at the 3 trips with the longest elapsed time, we see some possible data integrity issues:
taxis.nlargest(3, 'elapsed_time')
pickup | dropoff | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7576 | 2019-10-23 16:52:51 | 2019-10-24 16:51:44 | 1 | 3.75 | 1 | 17.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 21.8 | 2.5 | 0 days 23:58:53 | 21.8 | 0.0 | 4.3 | 0.156371 |
6902 | 2019-10-23 16:51:42 | 2019-10-24 16:50:22 | 1 | 11.19 | 2 | 39.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 41.3 | 0.0 | 0 days 23:58:40 | 41.3 | 0.0 | 1.8 | 0.466682 |
4975 | 2019-10-23 16:18:51 | 2019-10-24 16:17:30 | 1 | 0.70 | 2 | 7.0 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 11.3 | 2.5 | 0 days 23:58:39 | 11.3 | 0.0 | 4.3 | 0.029194 |
import pandas as pd
meteorites = pd.read_csv('../data/Meteorite_Landings.csv')
meteorites = meteorites\
.rename(columns={'mass (g)': 'mass'})\
.drop(columns=meteorites.columns[-3:])\
.sort_values('mass', ascending=False)
meteorites.head()
name | id | nametype | recclass | mass | fall | year | |
---|---|---|---|---|---|---|---|
16392 | Hoba | 11890 | Valid | Iron, IVB | 60000000.0 | Found | 01/01/1920 12:00:00 AM |
5373 | Cape York | 5262 | Valid | Iron, IIIAB | 58200000.0 | Found | 01/01/1818 12:00:00 AM |
5365 | Campo del Cielo | 5247 | Valid | Iron, IAB-MG | 50000000.0 | Found | 12/22/1575 12:00:00 AM |
5370 | Canyon Diablo | 5257 | Valid | Iron, IAB-MG | 30000000.0 | Found | 01/01/1891 12:00:00 AM |
3455 | Armanty | 2335 | Valid | Iron, IIIE | 28000000.0 | Found | 01/01/1898 12:00:00 AM |
So far, we haven't really worked with the index because it's just been a row number; however, we can change the values we have in the index to access additional features of the pandas library.
Currently, we have a RangeIndex, but we can switch to a DatetimeIndex by specifying a datetime column when calling set_index()
:
taxis = taxis.set_index('pickup')
taxis.head(3)
dropoff | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pickup | |||||||||||||||||
2019-10-23 16:39:42 | 2019-10-23 17:14:10 | 1 | 7.93 | 1 | 29.5 | 1.0 | 0.5 | 7.98 | 6.12 | 0.3 | 47.9 | 2.5 | 0 days 00:34:28 | 39.92 | 0.199900 | 10.42 | 13.804642 |
2019-10-23 16:32:08 | 2019-10-23 16:45:26 | 1 | 2.00 | 1 | 10.5 | 1.0 | 0.5 | 0.00 | 0.00 | 0.3 | 12.3 | 0.0 | 0 days 00:13:18 | 12.30 | 0.000000 | 1.80 | 9.022556 |
2019-10-23 16:08:44 | 2019-10-23 16:21:11 | 1 | 1.36 | 1 | 9.5 | 1.0 | 0.5 | 2.00 | 0.00 | 0.3 | 15.8 | 2.5 | 0 days 00:12:27 | 13.80 | 0.144928 | 4.30 | 6.554217 |
Since we have a sample of the full dataset, let's sort the index to order by pickup time:
taxis = taxis.sort_index()
Tip: taxis.sort_index(axis=1)
will sort the columns by name. The axis
parameter is present throughout the pandas library: axis=0
targets rows and axis=1
targets columns.
We can now select ranges from our data based on the datetime the same way we did with row numbers:
taxis['2019-10-23 07:45':'2019-10-23 08']
dropoff | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pickup | |||||||||||||||||
2019-10-23 07:48:58 | 2019-10-23 07:52:09 | 1 | 0.67 | 2 | 4.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 8.8 | 2.5 | 0 days 00:03:11 | 8.8 | 0.000000 | 4.3 | 12.628272 |
2019-10-23 08:02:09 | 2019-10-24 07:42:32 | 1 | 8.38 | 1 | 32.0 | 1.0 | 0.5 | 5.5 | 0.0 | 0.3 | 41.8 | 2.5 | 0 days 23:40:23 | 36.3 | 0.151515 | 4.3 | 0.353989 |
2019-10-23 08:18:47 | 2019-10-23 08:36:05 | 1 | 2.39 | 2 | 12.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 16.8 | 2.5 | 0 days 00:17:18 | 16.8 | 0.000000 | 4.3 | 8.289017 |
When not specifying a range, we use loc[]
:
taxis.loc['2019-10-23 08']
dropoff | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pickup | |||||||||||||||||
2019-10-23 08:02:09 | 2019-10-24 07:42:32 | 1 | 8.38 | 1 | 32.0 | 1.0 | 0.5 | 5.5 | 0.0 | 0.3 | 41.8 | 2.5 | 0 days 23:40:23 | 36.3 | 0.151515 | 4.3 | 0.353989 |
2019-10-23 08:18:47 | 2019-10-23 08:36:05 | 1 | 2.39 | 2 | 12.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 16.8 | 2.5 | 0 days 00:17:18 | 16.8 | 0.000000 | 4.3 | 8.289017 |
We will be working with time series later this section, but sometimes we want to reset our index to row numbers and restore the columns. We can make pickup
a column again with the reset_index()
method:
taxis = taxis.reset_index()
taxis.head()
pickup | dropoff | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-10-23 07:05:34 | 2019-10-23 08:03:16 | 3 | 14.68 | 1 | 50.0 | 1.0 | 0.5 | 4.0 | 0.0 | 0.3 | 55.8 | 0.0 | 0 days 00:57:42 | 51.8 | 0.077220 | 1.8 | 15.265165 |
1 | 2019-10-23 07:48:58 | 2019-10-23 07:52:09 | 1 | 0.67 | 2 | 4.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 8.8 | 2.5 | 0 days 00:03:11 | 8.8 | 0.000000 | 4.3 | 12.628272 |
2 | 2019-10-23 08:02:09 | 2019-10-24 07:42:32 | 1 | 8.38 | 1 | 32.0 | 1.0 | 0.5 | 5.5 | 0.0 | 0.3 | 41.8 | 2.5 | 0 days 23:40:23 | 36.3 | 0.151515 | 4.3 | 0.353989 |
3 | 2019-10-23 08:18:47 | 2019-10-23 08:36:05 | 1 | 2.39 | 2 | 12.5 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 16.8 | 2.5 | 0 days 00:17:18 | 16.8 | 0.000000 | 4.3 | 8.289017 |
4 | 2019-10-23 09:27:16 | 2019-10-23 09:33:13 | 2 | 1.11 | 2 | 6.0 | 1.0 | 0.5 | 0.0 | 0.0 | 0.3 | 7.8 | 0.0 | 0 days 00:05:57 | 7.8 | 0.000000 | 1.8 | 11.193277 |
Using the Meteorite_Landings.csv file, update the year column to only contain the year, convert it to a numeric data type, and create a new column indicating whether the meteorite was observed falling before 1970. Set the index to the id column and extract all the rows with IDs between 10,036 and 10,040 (inclusive) with loc[].
Hint: Use year.str.slice() to grab a substring.
Hint: Use loc[] to select the range.
Bonus: There's a data entry error in the year column. Can you find it? (Don't spend too much time on this.)
import pandas as pd
meteorites = pd.read_csv('../data/Meteorite_Landings.csv').assign(
year=lambda x: pd.to_numeric(x.year.str.slice(6, 10)),
pre_1970=lambda x: (x.fall == 'Fell') & (x.year < 1970)
).set_index('id')
meteorites.sort_index().loc[10_036:10_040]
name | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | pre_1970 | |
---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||
10036 | Enigma | Valid | H4 | 94.0 | Found | 1967.0 | 31.33333 | -82.31667 | (31.33333, -82.31667) | False |
10037 | Enon | Valid | Iron, ungrouped | 763.0 | Found | 1883.0 | 39.86667 | -83.95000 | (39.86667, -83.95) | False |
10038 | Enshi | Valid | H5 | 8000.0 | Fell | 1974.0 | 30.30000 | 109.50000 | (30.3, 109.5) | False |
10039 | Ensisheim | Valid | LL6 | 127000.0 | Fell | 1491.0 | 47.86667 | 7.35000 | (47.86667, 7.35) | True |
Note: The pd.to_datetime()
function is another option here; however, it will only be able to convert dates within the supported bounds (between pd.Timestamp.min
and pd.Timestamp.max
), which will cause some entries that do have a year to be marked as not having one. More information can be found in the pandas documentation here. For reference, this is how the conversion can be done:
pd.to_datetime(
meteorites.year,
errors='coerce', # anything that can't be converted will be NaT (null)
format='%m/%d/%Y %I:%M:%S %p' # the format the datetimes are currently in
)
There's a data entry error in the year column. Can you find it?
meteorites.year.describe()
count 45425.000000 mean 1991.828817 std 25.052766 min 860.000000 25% 1987.000000 50% 1998.000000 75% 2003.000000 max 2101.000000 Name: year, dtype: float64
There's a meteorite that was reportedly found in the future:
meteorites.query(f'year > {pd.Timestamp("today").year}')
name | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | pre_1970 | |
---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||
57150 | Northwest Africa 7701 | Valid | CK6 | 55.0 | Found | 2101.0 | 0.0 | 0.0 | (0.0, 0.0) | False |
The taxi dataset we have been working with is in a format conducive to analysis. This isn't always the case. Let's now take a look at the TSA traveler throughput data, which compares 2021 throughput to the same day in 2020 and 2019:
tsa = pd.read_csv('../data/tsa_passenger_throughput.csv', parse_dates=['Date'])
tsa.head()
Date | 2021 Traveler Throughput | 2020 Traveler Throughput | 2019 Traveler Throughput | |
---|---|---|---|---|
0 | 2021-05-14 | 1716561.0 | 250467 | 2664549 |
1 | 2021-05-13 | 1743515.0 | 234928 | 2611324 |
2 | 2021-05-12 | 1424664.0 | 176667 | 2343675 |
3 | 2021-05-11 | 1315493.0 | 163205 | 2191387 |
4 | 2021-05-10 | 1657722.0 | 215645 | 2512315 |
Source: TSA.gov
First, we will lowercase the column names and take the first word (e.g., 2021
for 2021 Traveler Throughput
) to make this easier to work with:
tsa = tsa.rename(columns=lambda x: x.lower().split()[0])
tsa.head()
date | 2021 | 2020 | 2019 | |
---|---|---|---|---|
0 | 2021-05-14 | 1716561.0 | 250467 | 2664549 |
1 | 2021-05-13 | 1743515.0 | 234928 | 2611324 |
2 | 2021-05-12 | 1424664.0 | 176667 | 2343675 |
3 | 2021-05-11 | 1315493.0 | 163205 | 2191387 |
4 | 2021-05-10 | 1657722.0 | 215645 | 2512315 |
Now, we can work on reshaping it into two columns: the date and the traveler throughput from 2019 through 2021.
Starting with the wide-format data below, we want to melt it into long-format data so that we can look at the evolution of the throughput over time:
from utils import highlight_long_format
colors = {'2021': 'pink', '2020': 'skyblue', '2019': 'lightgreen'}
highlight_long_format(tsa.head(2), colors)
date | 2021 | 2020 | 2019 | |
---|---|---|---|---|
0 | 2021-05-14 00:00:00 | 1716561.000000 | 250467 | 2664549 |
1 | 2021-05-13 00:00:00 | 1743515.000000 | 234928 | 2611324 |
Note that the two rows above contain the same data as the six rows below:
from utils import highlight_wide_format
highlight_wide_format(tsa.head(2), colors)
date | year | travelers | |
---|---|---|---|
0 | 2021-05-14 00:00:00 | 2021 | 1716561.000000 |
1 | 2021-05-13 00:00:00 | 2021 | 1743515.000000 |
2 | 2020-05-14 00:00:00 | 2020 | 250467.000000 |
3 | 2020-05-13 00:00:00 | 2020 | 234928.000000 |
4 | 2019-05-14 00:00:00 | 2019 | 2664549.000000 |
5 | 2019-05-13 00:00:00 | 2019 | 2611324.000000 |
Let's work on making this transformation.
Melting helps convert our data into long format. Now, we have all the traveler throughput numbers in a single column:
tsa_melted = tsa.melt(
id_vars='date', # column that uniquely identifies a row (can be multiple)
var_name='year', # name for the new column created by melting
value_name='travelers' # name for new column containing values from melted columns
)
tsa_melted.sample(5, random_state=1) # show some random entries
date | year | travelers | |
---|---|---|---|
974 | 2020-09-12 | 2019 | 1879822.0 |
435 | 2021-03-05 | 2020 | 2198517.0 |
1029 | 2020-07-19 | 2019 | 2727355.0 |
680 | 2020-07-03 | 2020 | 718988.0 |
867 | 2020-12-28 | 2019 | 2500396.0 |
To convert this into a time series of traveler throughput, we need to replace the year in the date
column with the one in the year
column. Otherwise, we are marking prior years' numbers with the wrong year.
tsa_melted = tsa_melted.assign(
date=lambda x: pd.to_datetime(x.year + x.date.dt.strftime('-%m-%d'))
)
tsa_melted.sample(5, random_state=1)
date | year | travelers | |
---|---|---|---|
974 | 2019-09-12 | 2019 | 1879822.0 |
435 | 2020-03-05 | 2020 | 2198517.0 |
1029 | 2019-07-19 | 2019 | 2727355.0 |
680 | 2020-07-03 | 2020 | 718988.0 |
867 | 2019-12-28 | 2019 | 2500396.0 |
This leaves us with some null values (the dates that aren't present in the dataset):
tsa_melted.sort_values('date').tail(3)
date | year | travelers | |
---|---|---|---|
136 | 2021-12-29 | 2021 | NaN |
135 | 2021-12-30 | 2021 | NaN |
134 | 2021-12-31 | 2021 | NaN |
These can be dropped with the dropna()
method:
tsa_melted = tsa_melted.dropna()
tsa_melted.sort_values('date').tail(3)
date | year | travelers | |
---|---|---|---|
2 | 2021-05-12 | 2021 | 1424664.0 |
1 | 2021-05-13 | 2021 | 1743515.0 |
0 | 2021-05-14 | 2021 | 1716561.0 |
Using the melted data, we can pivot the data to compare TSA traveler throughput on specific days across years:
tsa_pivoted = tsa_melted\
.query('date.dt.month == 3 and date.dt.day <= 10')\
.assign(day_in_march=lambda x: x.date.dt.day)\
.pivot(index='year', columns='day_in_march', values='travelers')
tsa_pivoted
day_in_march | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||
2019 | 2257920.0 | 1979558.0 | 2143619.0 | 2402692.0 | 2543689.0 | 2156262.0 | 2485430.0 | 2378673.0 | 2122898.0 | 2187298.0 |
2020 | 2089641.0 | 1736393.0 | 1877401.0 | 2130015.0 | 2198517.0 | 1844811.0 | 2119867.0 | 1909363.0 | 1617220.0 | 1702686.0 |
2021 | 1049692.0 | 744812.0 | 826924.0 | 1107534.0 | 1168734.0 | 992406.0 | 1278557.0 | 1119303.0 | 825745.0 | 974221.0 |
Important: We aren't covering the unstack()
and stack()
methods, which are additional ways to pivot and melt, respectively. These come in handy when we have a multi-level index (e.g., if we ran set_index()
with more than one column). More information can be found here.
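For reference, here is a sketch of the same reshaping done with set_index() plus unstack() instead of pivot() – an alternative, not what we ran above:
tsa_melted\
    .query('date.dt.month == 3 and date.dt.day <= 10')\
    .assign(day_in_march=lambda x: x.date.dt.day)\
    .set_index(['year', 'day_in_march'])\
    .travelers\
    .unstack()  # moves the inner index level (day_in_march) into the columns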
The T
attribute provides a quick way to flip rows and columns.
tsa_pivoted.T
year | 2019 | 2020 | 2021 |
---|---|---|---|
day_in_march | |||
1 | 2257920.0 | 2089641.0 | 1049692.0 |
2 | 1979558.0 | 1736393.0 | 744812.0 |
3 | 2143619.0 | 1877401.0 | 826924.0 |
4 | 2402692.0 | 2130015.0 | 1107534.0 |
5 | 2543689.0 | 2198517.0 | 1168734.0 |
6 | 2156262.0 | 1844811.0 | 992406.0 |
7 | 2485430.0 | 2119867.0 | 1278557.0 |
8 | 2378673.0 | 1909363.0 | 1119303.0 |
9 | 2122898.0 | 1617220.0 | 825745.0 |
10 | 2187298.0 | 1702686.0 | 974221.0 |
We typically observe changes in air travel around the holidays, so adding information about the dates in the TSA dataset provides more context. The holidays.csv
file contains a few major holidays in the United States:
holidays = pd.read_csv('../data/holidays.csv', parse_dates=True, index_col='date')
holidays.loc['2019']
holiday | |
---|---|
date | |
2019-01-01 | New Year's Day |
2019-05-27 | Memorial Day |
2019-07-04 | July 4th |
2019-09-02 | Labor Day |
2019-11-28 | Thanksgiving |
2019-12-24 | Christmas Eve |
2019-12-25 | Christmas Day |
2019-12-31 | New Year's Eve |
Merging the holidays with the TSA traveler throughput data will provide more context for our analysis:
tsa_melted_holidays = tsa_melted\
.merge(holidays, left_on='date', right_index=True, how='left')\
.sort_values('date')
tsa_melted_holidays.head()
date | year | travelers | holiday | |
---|---|---|---|---|
863 | 2019-01-01 | 2019 | 2126398.0 | New Year's Day |
862 | 2019-01-02 | 2019 | 2345103.0 | NaN |
861 | 2019-01-03 | 2019 | 2202111.0 | NaN |
860 | 2019-01-04 | 2019 | 2150571.0 | NaN |
859 | 2019-01-05 | 2019 | 1975947.0 | NaN |
Tip: There are many parameters for this method, so be sure to check out the documentation. To append rows, take a look at the pd.concat()
function.
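For instance, a minimal sketch of appending rows with pd.concat(), using an arbitrary split of the merged data purely for illustration:
part_1 = tsa_melted_holidays.iloc[:500]
part_2 = tsa_melted_holidays.iloc[500:]
pd.concat([part_1, part_2]).equals(tsa_melted_holidays)  # stacking the pieces gives back the original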
We can take this a step further by marking a few days before and after each holiday as part of the holiday. This would make it easier to compare holiday travel across years and look for any uptick in travel around the holidays:
tsa_melted_holiday_travel = tsa_melted_holidays.assign(
holiday=lambda x:
x.holiday\
.ffill(limit=1)\
.bfill(limit=2)
)
Tip: Check out the fillna()
method documentation for additional functionality for replacing NA
/NaN
values.
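As a minimal sketch, fillna() with a constant would give the non-holiday days an explicit label (the 'non-holiday' string is just for illustration):
tsa_melted_holiday_travel.holiday.fillna('non-holiday').value_counts()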
Notice that we now have values for the day after each holiday and the two days prior. Thanksgiving in 2019 was on November 28th, so the 26th, 27th, and 29th were filled. Since we are only replacing null values, we don't override Christmas Day with the forward fill of Christmas Eve:
tsa_melted_holiday_travel.query(
'year == "2019" and '
'(holiday == "Thanksgiving" or holiday.str.contains("Christmas"))'
)
date | year | travelers | holiday | |
---|---|---|---|---|
899 | 2019-11-26 | 2019 | 1591158.0 | Thanksgiving |
898 | 2019-11-27 | 2019 | 1968137.0 | Thanksgiving |
897 | 2019-11-28 | 2019 | 2648268.0 | Thanksgiving |
896 | 2019-11-29 | 2019 | 2882915.0 | Thanksgiving |
873 | 2019-12-22 | 2019 | 1981433.0 | Christmas Eve |
872 | 2019-12-23 | 2019 | 1937235.0 | Christmas Eve |
871 | 2019-12-24 | 2019 | 2552194.0 | Christmas Eve |
870 | 2019-12-25 | 2019 | 2582580.0 | Christmas Day |
869 | 2019-12-26 | 2019 | 2470786.0 | Christmas Day |
After reshaping and cleaning our data, we can perform aggregations to summarize it in a variety of ways. In this section, we will explore using pivot tables, crosstabs, and group by operations to aggregate the data.
We can build a pivot table to compare holiday travel across the years in our dataset:
tsa_melted_holiday_travel.pivot_table(
index='year', columns='holiday', sort=False,
values='travelers', aggfunc='sum'
)
holiday | New Year's Day | Memorial Day | July 4th | Labor Day | Thanksgiving | Christmas Eve | Christmas Day | New Year's Eve |
---|---|---|---|---|---|---|---|---|
year | ||||||||
2019 | 4471501.0 | 9720691.0 | 9414228.0 | 8314811.0 | 9090478.0 | 6470862.0 | 5053366.0 | 6535464.0 |
2020 | 4490388.0 | 1126253.0 | 2682541.0 | 2993653.0 | 3364358.0 | 3029810.0 | 1745242.0 | 3057449.0 |
2021 | 1998871.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
We can use the pct_change()
method on this result to see which holiday travel periods saw the biggest change in travel:
tsa_melted_holiday_travel.pivot_table(
index='year', columns='holiday', sort=False,
values='travelers', aggfunc='sum'
).pct_change(fill_method=None)
holiday | New Year's Day | Memorial Day | July 4th | Labor Day | Thanksgiving | Christmas Eve | Christmas Day | New Year's Eve |
---|---|---|---|---|---|---|---|---|
year | ||||||||
2019 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2020 | 0.004224 | -0.884139 | -0.715055 | -0.639961 | -0.629903 | -0.531776 | -0.654638 | -0.532176 |
2021 | -0.554856 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Let's make one last pivot table with column and row subtotals, along with some formatting improvements. First, we set a display option for all floats:
pd.set_option('display.float_format', '{:,.0f}'.format)
Next, we group together Christmas Eve and Christmas Day, and likewise New Year's Eve and New Year's Day (accounting for the change in year), and create the pivot table:
import numpy as np
pivot_table = tsa_melted_holiday_travel.assign(
year=lambda x: np.where(
x.holiday == "New Year's Day", pd.to_numeric(x.year) - 1, x.year
).astype(str),
holiday=lambda x: np.where(
x.holiday.str.contains('Christmas|New Year', regex=True),
x.holiday.str.replace('Day|Eve', '', regex=True).str.strip(),
x.holiday
)
).pivot_table(
index='year', columns='holiday', sort=False,
values='travelers', aggfunc='sum',
margins=True, margins_name='Total'
)
# reorder columns by order in the year
pivot_table.insert(5, "New Year's", pivot_table.pop("New Year's"))
pivot_table
holiday | Memorial Day | July 4th | Labor Day | Thanksgiving | Christmas | New Year's | Total |
---|---|---|---|---|---|---|---|
year | |||||||
2018 | NaN | NaN | NaN | NaN | NaN | 4,471,501 | 4,471,501 |
2019 | 9,720,691 | 9,414,228 | 8,314,811 | 9,090,478 | 11,524,228 | 11,025,852 | 59,090,288 |
2020 | 1,126,253 | 2,682,541 | 2,993,653 | 3,364,358 | 4,775,052 | 5,056,320 | 19,998,177 |
Total | 10,846,944 | 12,096,769 | 11,308,464 | 12,454,836 | 16,299,280 | 20,553,673 | 83,559,966 |
Before moving on, let's reset the display option:
pd.reset_option('display.float_format')
Tip: Read more about options in the documentation here.
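A few other options that often come in handy, shown here as a small sketch using standard pandas options:
pd.set_option('display.max_columns', None)  # show every column of wide DataFrames
pd.get_option('display.max_rows')           # inspect an option's current value
pd.reset_option('display.max_columns')      # restore the default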
Using the Meteorite_Landings.csv file, create a pivot table that shows both the number of meteorites and the 95th percentile of meteorite mass for those that were found versus observed falling per year from 2005 through 2009 (inclusive). Hint: Be sure to convert the year column to a number as we did in the previous exercise.
import pandas as pd
meteorites = pd.read_csv('../data/Meteorite_Landings.csv').assign(
year=lambda x: pd.to_numeric(x.year.str.slice(6, 10))
)
meteorites.query('year.between(2005, 2009)').pivot_table(
index='year', columns='fall', values='mass (g)',
aggfunc=['count', lambda x: x.quantile(0.95)]
).rename(columns={'<lambda>': '95th percentile'})
count | 95th percentile | |||
---|---|---|---|---|
fall | Fell | Found | Fell | Found |
year | ||||
2005.0 | NaN | 874.0 | NaN | 4500.00 |
2006.0 | 5.0 | 2450.0 | 25008.0 | 1600.50 |
2007.0 | 8.0 | 1181.0 | 89675.0 | 1126.90 |
2008.0 | 9.0 | 948.0 | 106000.0 | 2274.80 |
2009.0 | 5.0 | 1492.0 | 8333.4 | 1397.25 |
The pd.crosstab()
function provides an easy way to create a frequency table. Here, we count the number of low-, medium-, and high-volume travel days per year, using the pd.cut()
function to create three travel volume bins of equal width:
pd.crosstab(
index=pd.cut(
tsa_melted_holiday_travel.travelers,
bins=3, labels=['low', 'medium', 'high']
),
columns=tsa_melted_holiday_travel.year,
rownames=['travel_volume']
)
year | 2019 | 2020 | 2021 |
---|---|---|---|
travel_volume | |||
low | 0 | 277 | 54 |
medium | 42 | 44 | 80 |
high | 323 | 44 | 0 |
Tip: The pd.cut()
function can also be used to specify custom bin ranges. For equal-sized bins based on quantiles, use the pd.qcut()
function instead.
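For instance, a sketch of both approaches – the bin edges and labels below are arbitrary choices for illustration:
pd.cut(
    tsa_melted_holiday_travel.travelers,
    bins=[0, 1_000_000, 2_000_000, 3_000_000],  # custom edges in travelers per day
    labels=['low', 'medium', 'high']
).value_counts()

pd.qcut(tsa_melted_holiday_travel.travelers, q=4).value_counts()  # four equal-sized quantile bins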
Note that the pd.crosstab()
function supports other aggregations provided you pass in the data to aggregate as values
and specify the aggregation with aggfunc
. You can also add subtotals and normalize the data. See the documentation for more information.
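As a sketch of that, here is the same crosstab with an aggregation – the median daily travelers per bin and year (the choice of median is arbitrary):
pd.crosstab(
    index=pd.cut(
        tsa_melted_holiday_travel.travelers,
        bins=3, labels=['low', 'medium', 'high']
    ),
    columns=tsa_melted_holiday_travel.year,
    values=tsa_melted_holiday_travel.travelers,  # the data to aggregate
    aggfunc='median',                            # how to aggregate it
    rownames=['travel_volume']
)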
Rather than perform aggregations, like mean()
or describe()
, on the full dataset at once, we can perform these calculations per group by first calling groupby()
:
tsa_melted_holiday_travel.groupby('year').describe(include=np.number)
travelers | ||||||||
---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | |
year | ||||||||
2019 | 365.0 | 2.309482e+06 | 285061.490784 | 1534386.0 | 2091116.0 | 2358007.0 | 2538384.00 | 2882915.0 |
2020 | 365.0 | 8.818674e+05 | 639775.194297 | 87534.0 | 507129.0 | 718310.0 | 983745.00 | 2507588.0 |
2021 | 134.0 | 1.112632e+06 | 338040.673782 | 468933.0 | 807156.0 | 1117391.0 | 1409377.75 | 1743515.0 |
Groups can also be used to perform separate calculations per subset of the data. For example, we can find the highest-volume travel day per year using rank()
:
tsa_melted_holiday_travel.assign(
travel_volume_rank=lambda x: x.groupby('year').travelers.rank(ascending=False)
).sort_values(['travel_volume_rank', 'year']).head(3)
date | year | travelers | holiday | travel_volume_rank | |
---|---|---|---|---|---|
896 | 2019-11-29 | 2019 | 2882915.0 | Thanksgiving | 1.0 |
456 | 2020-02-12 | 2020 | 2507588.0 | NaN | 1.0 |
1 | 2021-05-13 | 2021 | 1743515.0 | NaN | 1.0 |
The previous two examples called a single method on the grouped data, but using the agg()
method we can specify any number of them:
tsa_melted_holiday_travel.assign(
holiday_travelers=lambda x: np.where(~x.holiday.isna(), x.travelers, np.nan),
non_holiday_travelers=lambda x: np.where(x.holiday.isna(), x.travelers, np.nan),
year=lambda x: pd.to_numeric(x.year)
).select_dtypes(include='number').groupby('year').agg(['mean', 'std'])
travelers | holiday_travelers | non_holiday_travelers | ||||
---|---|---|---|---|---|---|
mean | std | mean | std | mean | std | |
year | ||||||
2019 | 2.309482e+06 | 285061.490784 | 2.271977e+06 | 303021.675751 | 2.312359e+06 | 283906.226598 |
2020 | 8.818674e+05 | 639775.194297 | 8.649882e+05 | 489938.240989 | 8.831619e+05 | 650399.772930 |
2021 | 1.112632e+06 | 338040.673782 | 9.994355e+05 | 273573.249680 | 1.114347e+06 | 339479.298658 |
Tip: The select_dtypes()
method makes it possible to select columns by their data type. We can specify the data types to exclude
and/or include
.
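For example, a quick sketch of the exclude side:
tsa_melted_holiday_travel.select_dtypes(exclude='number').dtypes  # only the non-numeric columns remain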
In addition, we can specify which aggregations to perform on each column:
tsa_melted_holiday_travel.assign(
holiday_travelers=lambda x: np.where(~x.holiday.isna(), x.travelers, np.nan),
non_holiday_travelers=lambda x: np.where(x.holiday.isna(), x.travelers, np.nan)
).groupby('year').agg({
'holiday_travelers': ['mean', 'std'],
'holiday': ['nunique', 'count']
})
holiday_travelers | holiday | |||
---|---|---|---|---|
mean | std | nunique | count | |
year | ||||
2019 | 2.271977e+06 | 303021.675751 | 8 | 26 |
2020 | 8.649882e+05 | 489938.240989 | 8 | 26 |
2021 | 9.994355e+05 | 273573.249680 | 1 | 2 |
We are only scratching the surface; some additional functionalities to be aware of include the following:
- Groups can be excluded from calculations with the filter() method.
- We can group on index levels using the level parameter with either the level's position or its name, e.g., groupby(level=0) or groupby(level='year').
- We can group by date ranges using a pd.Grouper() object.

Be sure to check out the documentation for more details; a short sketch of filter() and pd.Grouper() follows below.
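As a brief sketch of those two items (the 1-million-traveler threshold and the month-end frequency are arbitrary choices for this example, assuming a recent pandas version where 'ME' is the month-end alias):
tsa_melted_holiday_travel.groupby('year').filter(
    lambda g: g.travelers.mean() > 1_000_000  # keep only years averaging over 1M travelers per day
).year.unique()

tsa_melted_holiday_travel.groupby(
    pd.Grouper(key='date', freq='ME')  # group the date column into month-long bins
).travelers.sum().head(3)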
import pandas as pd
meteorites = pd.read_csv('../data/Meteorite_Landings.csv')
meteorites.groupby('fall')['mass (g)'].describe()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
fall | ||||||||
Fell | 1075.0 | 47070.715023 | 717067.125826 | 0.1 | 686.00 | 2800.0 | 10450.0 | 23000000.0 |
Found | 44510.0 | 12461.922983 | 571105.752311 | 0.0 | 6.94 | 30.5 | 178.0 | 60000000.0 |
When working with time series data, pandas provides us with additional functionality to not just compare the observations in our dataset, but to use their relationship in time to analyze the data. In this section, we will see a few such operations for selecting date/time ranges, calculating changes over time, performing window calculations, and resampling the data to different date/time intervals.
Let's switch back to the taxis
dataset, which has timestamps of pickups and dropoffs. First, we will set the dropoff
column as the index and sort the data:
taxis = taxis.set_index('dropoff').sort_index()
We saw earlier that we can slice on the datetimes:
taxis['2019-10-24 12':'2019-10-24 13']
pickup | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
dropoff | |||||||||||||||||
2019-10-24 12:30:08 | 2019-10-23 13:25:42 | 4 | 0.76 | 2 | 5.0 | 1.0 | 0.5 | 0.00 | 0.0 | 0.3 | 9.30 | 2.5 | 0 days 23:04:26 | 9.3 | 0.0 | 4.3 | 0.032938 |
2019-10-24 12:42:01 | 2019-10-23 13:34:03 | 2 | 1.58 | 1 | 7.5 | 1.0 | 0.5 | 2.36 | 0.0 | 0.3 | 14.16 | 2.5 | 0 days 23:07:58 | 11.8 | 0.2 | 4.3 | 0.068301 |
We can also represent this range with shorthand. Note that we must use loc[]
here:
taxis.loc['2019-10-24 12']
pickup | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
dropoff | |||||||||||||||||
2019-10-24 12:30:08 | 2019-10-23 13:25:42 | 4 | 0.76 | 2 | 5.0 | 1.0 | 0.5 | 0.00 | 0.0 | 0.3 | 9.30 | 2.5 | 0 days 23:04:26 | 9.3 | 0.0 | 4.3 | 0.032938 |
2019-10-24 12:42:01 | 2019-10-23 13:34:03 | 2 | 1.58 | 1 | 7.5 | 1.0 | 0.5 | 2.36 | 0.0 | 0.3 | 14.16 | 2.5 | 0 days 23:07:58 | 11.8 | 0.2 | 4.3 | 0.068301 |
However, if we want to look at this time range across days, we need another strategy.
We can pull out the dropoffs that happened between a certain time range on any day with the between_time()
method:
taxis.between_time('12:00', '13:00')
pickup | passenger_count | trip_distance | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | elapsed_time | cost_before_tip | tip_pct | fees | avg_speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
dropoff | |||||||||||||||||
2019-10-23 12:53:49 | 2019-10-23 12:35:27 | 5 | 2.49 | 1 | 13.5 | 1.0 | 0.5 | 2.20 | 0.0 | 0.3 | 20.00 | 2.5 | 0 days 00:18:22 | 17.8 | 0.123596 | 4.3 | 8.134301 |
2019-10-24 12:30:08 | 2019-10-23 13:25:42 | 4 | 0.76 | 2 | 5.0 | 1.0 | 0.5 | 0.00 | 0.0 | 0.3 | 9.30 | 2.5 | 0 days 23:04:26 | 9.3 | 0.000000 | 4.3 | 0.032938 |
2019-10-24 12:42:01 | 2019-10-23 13:34:03 | 2 | 1.58 | 1 | 7.5 | 1.0 | 0.5 | 2.36 | 0.0 | 0.3 | 14.16 | 2.5 | 0 days 23:07:58 | 11.8 | 0.200000 | 4.3 | 0.068301 |
Tip: The at_time()
method can be used to extract all entries at a given time (e.g., 12:35:27).
For the rest of this section, we will be working with the TSA traveler throughput data. Let's start by setting the index to the date
column:
tsa_melted_holiday_travel = tsa_melted_holiday_travel.set_index('date')
tsa_melted_holiday_travel.loc['2020'].assign(
one_day_change=lambda x: x.travelers.diff(),
seven_day_change=lambda x: x.travelers.diff(7),
).head(10)
year | travelers | holiday | one_day_change | seven_day_change | |
---|---|---|---|---|---|
date | |||||
2020-01-01 | 2020 | 2311732.0 | New Year's Day | NaN | NaN |
2020-01-02 | 2020 | 2178656.0 | New Year's Day | -133076.0 | NaN |
2020-01-03 | 2020 | 2422272.0 | NaN | 243616.0 | NaN |
2020-01-04 | 2020 | 2210542.0 | NaN | -211730.0 | NaN |
2020-01-05 | 2020 | 1806480.0 | NaN | -404062.0 | NaN |
2020-01-06 | 2020 | 1815040.0 | NaN | 8560.0 | NaN |
2020-01-07 | 2020 | 2034472.0 | NaN | 219432.0 | NaN |
2020-01-08 | 2020 | 2072543.0 | NaN | 38071.0 | -239189.0 |
2020-01-09 | 2020 | 1687974.0 | NaN | -384569.0 | -490682.0 |
2020-01-10 | 2020 | 2183734.0 | NaN | 495760.0 | -238538.0 |
Tip: To perform operations other than subtraction, take a look at the shift()
method. It also makes it possible to perform operations across columns.
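As a small sketch, shift() gives access to the previous observation itself, so we can compute something other than a difference, such as a day-over-day ratio:
tsa_melted_holiday_travel.loc['2020'].assign(
    pct_of_prev_day=lambda x: x.travelers / x.travelers.shift()  # today's travelers relative to yesterday's
).head(3)
The resampling example that follows aggregates these daily figures to quarterly sums, means, and standard deviations: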
tsa_melted_holiday_travel['2019':'2021-Q1'].select_dtypes(include='number')\
.resample('QE').agg(['sum', 'mean', 'std'])
travelers | |||
---|---|---|---|
sum | mean | std | |
date | |||
2019-03-31 | 189281658.0 | 2.103130e+06 | 282239.618354 |
2019-06-30 | 221756667.0 | 2.436886e+06 | 212600.697665 |
2019-09-30 | 220819236.0 | 2.400209e+06 | 260140.242892 |
2019-12-31 | 211103512.0 | 2.294603e+06 | 260510.040655 |
2020-03-31 | 155354148.0 | 1.726157e+06 | 685094.277420 |
2020-06-30 | 25049083.0 | 2.752646e+05 | 170127.402046 |
2020-09-30 | 63937115.0 | 6.949686e+05 | 103864.705739 |
2020-12-31 | 77541248.0 | 8.428397e+05 | 170245.484185 |
2021-03-31 | 86094635.0 | 9.566071e+05 | 280399.809061 |
Window calculations are similar to group by calculations except the group over which the calculation is performed isn't static – it can move or expand. Pandas provides functionality for constructing a variety of windows, including moving/rolling windows, expanding windows (e.g., cumulative sum or mean up to the current date in a time series), and exponentially weighted moving windows (to weight closer observations more than further ones). We will only look at rolling and expanding calculations here.
Performing a window calculation is very similar to a group by calculation – we first define the window, and then we specify the aggregation:
tsa_melted_holiday_travel.loc['2020'].assign(
**{
'7D MA': lambda x: x.rolling('7D').travelers.mean(),
'YTD mean': lambda x: x.expanding().travelers.mean()
}
).head(10)
year | travelers | holiday | 7D MA | YTD mean | |
---|---|---|---|---|---|
date | |||||
2020-01-01 | 2020 | 2311732.0 | New Year's Day | 2.311732e+06 | 2.311732e+06 |
2020-01-02 | 2020 | 2178656.0 | New Year's Day | 2.245194e+06 | 2.245194e+06 |
2020-01-03 | 2020 | 2422272.0 | NaN | 2.304220e+06 | 2.304220e+06 |
2020-01-04 | 2020 | 2210542.0 | NaN | 2.280800e+06 | 2.280800e+06 |
2020-01-05 | 2020 | 1806480.0 | NaN | 2.185936e+06 | 2.185936e+06 |
2020-01-06 | 2020 | 1815040.0 | NaN | 2.124120e+06 | 2.124120e+06 |
2020-01-07 | 2020 | 2034472.0 | NaN | 2.111313e+06 | 2.111313e+06 |
2020-01-08 | 2020 | 2072543.0 | NaN | 2.077144e+06 | 2.106467e+06 |
2020-01-09 | 2020 | 1687974.0 | NaN | 2.007046e+06 | 2.059968e+06 |
2020-01-10 | 2020 | 2183734.0 | NaN | 1.972969e+06 | 2.072344e+06 |
To understand what's happening, it's best to visualize the original data and the result, so here's a sneak peek of plotting with pandas. First, some setup to embed SVG-format plots in the notebook:
import matplotlib_inline
from utils import mpl_svg_config
matplotlib_inline.backend_inline.set_matplotlib_formats(
'svg', # output images using SVG format
**mpl_svg_config('section-2') # optional: configure metadata
)
Tip: For most use cases, only the first argument is necessary – we will discuss the second argument in more detail in the next section.
Now, we call the plot()
method to visualize the data:
_ = tsa_melted_holiday_travel.loc['2020'].assign(
**{
'7D MA': lambda x: x.rolling('7D').travelers.mean(),
'YTD mean': lambda x: x.expanding().travelers.mean()
}
).plot(title='2020 TSA Traveler Throughput', ylabel='travelers', alpha=0.8)
Other types of windows:
- Exponentially weighted windows: use the ewm() method (see the sketch after this list).
- Custom windows: subclass pandas.api.indexers.BaseIndexer or use a pre-built one in pandas.api.indexers.
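A minimal sketch of the ewm() option, which weights recent observations more heavily (span=7 is chosen arbitrarily here for illustration):
# exponentially weighted moving average of the 2020 traveler counts
tsa_melted_holiday_travel.loc['2020'].travelers.ewm(span=7).mean().head()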
import pandas as pd
taxis = pd.read_csv(
'../data/2019_Yellow_Taxi_Trip_Data.csv',
parse_dates=True, index_col='tpep_dropoff_datetime'
)
taxis.resample('1h')[[
'trip_distance', 'fare_amount', 'tolls_amount', 'tip_amount'
]].sum().nlargest(5, 'tip_amount')
trip_distance | fare_amount | tolls_amount | tip_amount | |
---|---|---|---|---|
tpep_dropoff_datetime | ||||
2019-10-23 16:00:00 | 10676.95 | 67797.76 | 699.04 | 12228.64 |
2019-10-23 17:00:00 | 16052.83 | 70131.91 | 4044.04 | 12044.03 |
2019-10-23 18:00:00 | 3104.56 | 11565.56 | 1454.67 | 1907.64 |
2019-10-23 15:00:00 | 14.34 | 213.50 | 0.00 | 51.75 |
2019-10-23 19:00:00 | 98.59 | 268.00 | 24.48 | 25.74 |
The human brain excels at finding patterns in visual representations of the data; so in this section, we will learn how to visualize data using pandas along with the Matplotlib and Seaborn libraries for additional features. We will create a variety of visualizations that will help us better understand our data.
So far, we have focused a lot on summarizing the data using statistics. However, summary statistics are not enough to understand the distribution – there are many possible distributions for a given set of summary statistics. Data visualization is necessary to truly understand the distribution.
We can create a variety of visualizations using the plot()
method. In this section, we will take a brief tour of some of this functionality, which under the hood uses Matplotlib.
Once again, we will be working with the TSA traveler throughput data that we cleaned up in the previous section:
import pandas as pd
tsa_melted_holiday_travel = pd.read_csv(
'../data/tsa_melted_holiday_travel.csv',
parse_dates=True, index_col='date'
)
tsa_melted_holiday_travel.head()
year | travelers | holiday | |
---|---|---|---|
date | |||
2019-01-01 | 2019 | 2126398.0 | New Year's Day |
2019-01-02 | 2019 | 2345103.0 | New Year's Day |
2019-01-03 | 2019 | 2202111.0 | NaN |
2019-01-04 | 2019 | 2150571.0 | NaN |
2019-01-05 | 2019 | 1975947.0 | NaN |
To embed SVG-format plots in the notebook, we will configure the Matplotlib plotting backend to generate SVG output (first argument) with custom metadata (second argument):
import matplotlib_inline
from utils import mpl_svg_config
matplotlib_inline.backend_inline.set_matplotlib_formats(
'svg', # output images using SVG format
**mpl_svg_config('section-3') # optional: configure metadata
)
Note: The second argument is optional and is used here to make the SVG output reproducible by setting the hashsalt
along with some metadata, which will be used by Matplotlib when generating any SVG output (see the utils.py
file for more details). Without this argument, different runs of the same plotting code will generate plots that are visually identical, but differ at the HTML level due to different IDs, metadata, etc.
Let's continue with the example of rolling and expanding calculations:
plot_data = tsa_melted_holiday_travel.drop(columns='year').loc['2020'].assign(
**{
'7D MA': lambda x: x.travelers.rolling('7D').mean(),
'YTD mean': lambda x: x.travelers.expanding().mean()
}
)
plot_data.head()
travelers | holiday | 7D MA | YTD mean | |
---|---|---|---|---|
date | ||||
2020-01-01 | 2311732.0 | New Year's Day | 2311732.0 | 2311732.0 |
2020-01-02 | 2178656.0 | New Year's Day | 2245194.0 | 2245194.0 |
2020-01-03 | 2422272.0 | NaN | 2304220.0 | 2304220.0 |
2020-01-04 | 2210542.0 | NaN | 2280800.5 | 2280800.5 |
2020-01-05 | 1806480.0 | NaN | 2185936.4 | 2185936.4 |
The plot()
method will generate line plots for all numeric columns by default:
plot_data.plot(
title='2020 TSA Traveler Throughput', ylabel='travelers', alpha=0.8
)
<Axes: title={'center': '2020 TSA Traveler Throughput'}, xlabel='date', ylabel='travelers'>
The plot()
method returns an Axes
object that can be modified further (e.g., to add reference lines, annotations, labels, etc.). Let's walk through an example.
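For instance, a minimal sketch that adds a dashed reference line at the 2020 daily mean, reusing the plot_data frame defined above (axhline() and legend() are plain Matplotlib calls on the returned Axes):
ax = plot_data.plot(title='2020 TSA Traveler Throughput', ylabel='travelers', alpha=0.8)
# draw a horizontal reference line at the mean daily throughput and refresh the legend
ax.axhline(plot_data.travelers.mean(), color='black', linestyle='--', label='2020 mean')
_ = ax.legend()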
For our next example, we will plot vertical bars to compare monthly TSA traveler throughput across years. Let's start by creating a pivot table with the information we need:
plot_data = tsa_melted_holiday_travel['2019':'2021-04']\
.assign(month=lambda x: x.index.month)\
.pivot_table(index='month', columns='year', values='travelers', aggfunc='sum')
plot_data.head()
year | 2019 | 2020 | 2021 |
---|---|---|---|
month | |||
1 | 59405722.0 | 61930286.0 | 23598230.0 |
2 | 57345684.0 | 60428859.0 | 24446345.0 |
3 | 72530252.0 | 32995003.0 | 38050060.0 |
4 | 70518994.0 | 3322548.0 | 41826159.0 |
5 | 74617773.0 | 7244733.0 | NaN |
Pandas offers other plot types via the kind
parameter, so we specify kind='bar'
when calling the plot()
method. Then, we further format the visualization using the Axes
object returned by the plot()
method:
import calendar
from matplotlib import ticker
ax = plot_data.plot(
kind='bar', rot=0, xlabel='', ylabel='travelers',
figsize=(8, 1.5), title='TSA Monthly Traveler Throughput'
)
# use month abbreviations for the ticks on the x-axis
ax.set_xticklabels(calendar.month_abbr[1:])
# show y-axis labels in millions instead of scientific notation
ax.yaxis.set_major_formatter(ticker.EngFormatter())
# customize the legend
ax.legend(title='', loc='center', bbox_to_anchor=(0.5, -0.3), ncols=3, frameon=False)
<matplotlib.legend.Legend at 0x103e10770>
Some additional things to keep in mind:
- The ticker module provides functionality for customizing both the tick labels and locations – check out the documentation for more information.
- The plot() method takes a lot of parameters, many of which get passed down to Matplotlib; however, sometimes we need to use Matplotlib calls directly.

Let's now compare the distribution of daily TSA traveler throughput across years. We will create a subplot for each year with both a histogram and a kernel density estimate (KDE) of the distribution. Pandas has generated the Figure and Axes objects for both examples so far, but we can build custom layouts by creating them ourselves with Matplotlib using the plt.subplots() function. First, we will need to import the pyplot module:
import matplotlib.pyplot as plt
While pandas lets us specify that we want subplots and their layout (with the subplots
and layout
parameters, respectively), using Matplotlib to create the subplots directly gives us additional flexibility:
# define the subplot layout
fig, axes = plt.subplots(3, 1, sharex=True, sharey=True, figsize=(6, 4))
for year, ax in zip(tsa_melted_holiday_travel.year.unique(), axes):
plot_data = tsa_melted_holiday_travel.loc[str(year)].travelers
plot_data.plot(kind='hist', legend=False, density=True, alpha=0.8, ax=ax)
plot_data.plot(kind='kde', legend=False, color='blue', ax=ax)
ax.set(title=f'{year} TSA Traveler Throughput', xlabel='travelers')
fig.tight_layout() # handle overlaps
Tip: If you're new to the zip()
function, check out this article.
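Coming back to the subplots and layout parameters mentioned above, a pandas-only sketch puts each column of a DataFrame on its own Axes; here we reuse the monthly pivot table from the bar chart example (the monthly variable name is just for illustration):
# one subplot per year column, stacked vertically
monthly = tsa_melted_holiday_travel['2019':'2021-04']\
    .assign(month=lambda x: x.index.month)\
    .pivot_table(index='month', columns='year', values='travelers', aggfunc='sum')
_ = monthly.plot(subplots=True, layout=(3, 1), sharex=True, figsize=(6, 4), legend=False)
The difference is that pandas splits the panels by column, whereas the Matplotlib loop above lets us split by an arbitrary grouping, such as year.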
We start by handling imports:
from matplotlib import ticker
import pandas as pd
Next, we write a function that will read in the data, use the pivot()
method to reshape it, and then use the plot()
method to generate the box plot:
def ex1():
df = pd.read_csv('../data/tsa_melted_holiday_travel.csv')
plot_data = df.pivot(columns='year', values='travelers')
ax = plot_data.plot(kind='box')
ax.set(xlabel='year', ylabel='travelers', title='TSA Traveler Throughput')
ax.yaxis.set_major_formatter(ticker.EngFormatter())
return ax
Finally, we call our function:
ex1()
<Axes: title={'center': 'TSA Traveler Throughput'}, xlabel='year', ylabel='travelers'>
The Seaborn library provides the means to easily visualize long-format data without first pivoting it. It also offers additional plot types – once again building on top of Matplotlib. Here, we will look at a few examples of visualizations we can create with Seaborn.
With Seaborn, we can specify plot colors according to values of a column with the hue
parameter. When working with functions that generate subplots, we can also specify how to split the subplots by values of a long-format column with the col
and row
parameters. Here, we revisit the comparison of the distribution of TSA traveler throughput across years:
import seaborn as sns
sns.displot(
data=tsa_melted_holiday_travel, x='travelers', col='year', kde=True, height=2.5
)
<seaborn.axisgrid.FacetGrid at 0x13f538560>
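The hue parameter mentioned above works similarly but overlays the groups on a single Axes instead of creating one facet per year; a minimal sketch:
# color the daily distributions by year on one set of axes
# (year is an integer column, so Seaborn maps it as a numeric hue)
ax = sns.histplot(data=tsa_melted_holiday_travel, x='travelers', hue='year', kde=True)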
We can also use Seaborn to visualize pivot tables as heatmaps:
data = tsa_melted_holiday_travel['2019':'2021-04']\
.assign(month=lambda x: x.index.month)\
.pivot_table(index='month', columns='year', values='travelers', aggfunc='sum')
data
year | 2019 | 2020 | 2021 |
---|---|---|---|
month | |||
1 | 59405722.0 | 61930286.0 | 23598230.0 |
2 | 57345684.0 | 60428859.0 | 24446345.0 |
3 | 72530252.0 | 32995003.0 | 38050060.0 |
4 | 70518994.0 | 3322548.0 | 41826159.0 |
5 | 74617773.0 | 7244733.0 | NaN |
6 | 76619900.0 | 14481802.0 | NaN |
7 | 79511968.0 | 20740781.0 | NaN |
8 | 74776010.0 | 21708071.0 | NaN |
9 | 66531258.0 | 21488263.0 | NaN |
10 | 72096495.0 | 25636496.0 | NaN |
11 | 68787654.0 | 25512987.0 | NaN |
12 | 70219363.0 | 26391765.0 | NaN |
ax = sns.heatmap(data=data / 1e6, cmap='Blues', annot=True, fmt='.1f')
_ = ax.set_yticklabels(calendar.month_abbr[1:], rotation=0)
_ = ax.set_title('Total TSA Traveler Throughput (in millions)')