"""Class representing a dataset for morphing."""
from numbers import Number
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.axes import Axes
from ..bounds.bounding_box import BoundingBox
from ..bounds.interval import Interval
from ..plotting.style import plot_with_custom_style
class Dataset:
    """
    Class for representing a starting dataset and bounds.

    .. plot::
       :caption:
           Upon creation, these bounds are automatically calculated.
           Use :meth:`plot` to generate this visualization.

       from data_morph.data.loader import DataLoader

       _ = DataLoader.load_dataset('panda').plot()

    Parameters
    ----------
    name : str
        The name to use for the dataset.
    df : pandas.DataFrame
        DataFrame containing columns x and y.
    scale : numbers.Number, optional
        The factor to scale the data by (can be used to speed up morphing).
        Values in the data's x and y columns will be divided by this value.

    Raises
    ------
    ValueError
        If ``df`` lacks an x or y column (compared case-insensitively), or
        if ``scale`` is zero.
    TypeError
        If ``scale`` is provided but is not a numeric value.

    See Also
    --------
    :class:`.DataLoader`
        Utility for creating :class:`Dataset` objects from CSV files.
    """

    _REQUIRED_COLUMNS = ['x', 'y']

    def __init__(
        self,
        name: str,
        df: pd.DataFrame,
        scale: Number = None,
    ) -> None:
        # NOTE: _scale_data() also records whether scaling happened in
        # self._scaled, which __repr__() reads.
        self.df: pd.DataFrame = self._validate_data(df).pipe(self._scale_data, scale)
        """pandas.DataFrame: DataFrame containing columns x and y."""

        self.name: str = name
        """str: The name to use for the dataset."""

        # The bounds build on one another, so the order of these three
        # assignments matters: data -> morph -> plot.
        self.data_bounds: BoundingBox = self._derive_data_bounds()
        """BoundingBox: The bounds of the data."""

        self.morph_bounds: BoundingBox = self._derive_morphing_bounds()
        """BoundingBox: The limits for the morphing process."""

        self.plot_bounds: BoundingBox = self._derive_plotting_bounds()
        """BoundingBox: The bounds to use when plotting the morphed data."""

    def __repr__(self) -> str:
        """Return a summary with the class name, dataset name, and scaled flag."""
        return f'<{self.__class__.__name__} name={self.name} scaled={self._scaled}>'

    def _derive_data_bounds(self) -> BoundingBox:
        """
        Derive bounds based on the data.

        Returns
        -------
        BoundingBox
            The bounds of the data.
        """
        # One interval per required column, spanning that column's min and max.
        return BoundingBox(
            *[
                Interval([self.df[dim].min(), self.df[dim].max()], inclusive=False)
                for dim in self._REQUIRED_COLUMNS
            ]
        )

    def _derive_morphing_bounds(self) -> BoundingBox:
        """
        Derive morphing bounds based on the data.

        Returns
        -------
        BoundingBox
            The bounds of the morphing process.
        """
        # TODO: range * 0.2 is still a bit arbitrary (need to take into account density at the edges)
        # could also make this a parameter to __init__()
        morph_bounds = self.data_bounds.clone()

        x_offset, y_offset = [offset * 0.2 for offset in self.data_bounds.range]

        morph_bounds.adjust_bounds(x=x_offset, y=y_offset)
        return morph_bounds

    def _derive_plotting_bounds(self) -> BoundingBox:
        """
        Derive plotting bounds based on the morphing bounds.

        Returns
        -------
        BoundingBox
            The bounds of the plot.
        """
        # TODO: range * 0.2 is still a bit arbitrary (need to take into account density at the edges)
        # could also make this a parameter to __init__()
        # The offsets come from the *data* range, but are applied to a copy
        # of the morphing bounds.
        x_offset, y_offset = [offset * 0.2 for offset in self.data_bounds.range]

        plot_bounds = self.morph_bounds.clone()
        plot_bounds.adjust_bounds(x=x_offset, y=y_offset)
        plot_bounds.align_aspect_ratio()
        return plot_bounds

    def _scale_data(self, df: pd.DataFrame, scale: Number) -> pd.DataFrame:
        """
        Apply scaling to the data.

        As a side effect, this sets ``self._scaled`` to indicate whether
        any scaling was applied.

        Parameters
        ----------
        df : pandas.DataFrame
            The data to scale.
        scale : numbers.Number, optional
            The factor to scale the data by (can be used to speed up morphing).
            Values in the data's x and y columns will be divided by this value.

        Returns
        -------
        pandas.DataFrame
            The scaled data.

        Raises
        ------
        TypeError
            If ``scale`` is not ``None`` and is not a numeric value.
        ValueError
            If ``scale`` is zero.
        """
        if scale is None:
            self._scaled = False
            return df

        # bool is a Number subclass, so it has to be rejected explicitly.
        if isinstance(scale, bool) or not isinstance(scale, Number):
            raise TypeError('scale must be a numeric value.')

        if not scale:
            raise ValueError('scale must be non-zero.')

        # assign() returns a new DataFrame; the input is left untouched.
        scaled_df = df.assign(x=df.x.div(scale), y=df.y.div(scale))
        self._scaled = True
        return scaled_df

    def _validate_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Validate the data.

        Parameters
        ----------
        data : pandas.DataFrame
            DataFrame for morphing.

        Returns
        -------
        pandas.DataFrame
            DataFrame provided it contains columns x and y.

        Raises
        ------
        ValueError
            If a required column is missing, even when column names are
            compared case-insensitively.
        """
        required = set(self._REQUIRED_COLUMNS)
        missing_columns = required.difference(data.columns)

        if missing_columns:
            # Only string labels can match a required column. Lowercasing
            # them directly (instead of via the ``.str`` accessor) keeps
            # non-string column labels from raising AttributeError before
            # the informative ValueError below can be raised.
            lowercased_labels = {
                label.lower() for label in data.columns if isinstance(label, str)
            }
            case_insensitive_missing = missing_columns.difference(lowercased_labels)

            if case_insensitive_missing:
                raise ValueError(
                    'Columns "x" and "y" are required for datasets. The provided '
                    'dataset is missing the following column(s): '
                    f"{', '.join(sorted(missing_columns))}."
                )

            # The columns exist under their uppercase names; normalize them.
            data = data.rename(columns={col.upper(): col for col in missing_columns})

        return data

    @plot_with_custom_style
    def plot(self, ax: Axes = None) -> Axes:
        """
        Plot the dataset and its bounds.

        Parameters
        ----------
        ax : matplotlib.axes.Axes, optional
            An optional :class:`~matplotlib.axes.Axes` object to plot on.

        Returns
        -------
        matplotlib.axes.Axes
            The :class:`~matplotlib.axes.Axes` object containing the plot.
        """
        # Test identity rather than truthiness: only an omitted argument
        # should trigger creation of a new figure.
        if ax is None:
            fig, ax = plt.subplots(layout='constrained')
            fig.get_layout_engine().set(w_pad=0.2, h_pad=0.2)

        ax.axis('equal')
        ax.scatter(self.df.x, self.df.y, s=2, color='black')
        ax.set(xlabel='', ylabel='', title=self)

        scale_base = 85

        # data bounds
        # The rectangle is drawn slightly larger than the data bounds (by
        # range / scale_base on each side).
        x_offset = self.data_bounds.x_bounds.range / scale_base
        y_offset = self.data_bounds.y_bounds.range / scale_base

        data_rectangle = [
            self.data_bounds.x_bounds[0] - x_offset,
            self.data_bounds.y_bounds[0] - y_offset,
        ]
        ax.add_patch(
            plt.Rectangle(
                data_rectangle,
                width=self.data_bounds.x_bounds.range + x_offset * 2,
                height=self.data_bounds.y_bounds.range + y_offset * 2,
                ec='blue',
                linewidth=2,
                fill=False,
            )
        )
        # Label is centered horizontally over the data, one y-offset above
        # the highest point.
        ax.text(
            (self.df.x.max() + self.df.x.min()) / 2,
            self.df.y.max() + self.data_bounds.y_bounds.range / scale_base,
            'DATA BOUNDS',
            color='blue',
            va='bottom',
            ha='center',
        )

        # morph bounds
        morph_rectangle = [self.morph_bounds.x_bounds[0], self.morph_bounds.y_bounds[0]]
        ax.add_patch(
            plt.Rectangle(
                morph_rectangle,
                width=self.morph_bounds.x_bounds.range,
                height=self.morph_bounds.y_bounds.range,
                ec='red',
                linewidth=2,
                fill=False,
            )
        )
        ax.text(*morph_rectangle, ' MORPH BOUNDS', color='red', va='bottom', ha='left')

        # plot bounds
        plot_rectangle = [self.plot_bounds.x_bounds[0], self.plot_bounds.y_bounds[0]]
        ax.add_patch(
            plt.Rectangle(
                plot_rectangle,
                width=self.plot_bounds.x_bounds.range,
                height=self.plot_bounds.y_bounds.range,
                ec='#7CA1CC',
                linewidth=2,
                fill=False,
            )
        )
        ax.text(
            *plot_rectangle, ' PLOT BOUNDS', color='#7CA1CC', va='bottom', ha='left'
        )

        # Rescale the view now that all patches have been added.
        ax.autoscale()
        return ax