"""
According to *On the theory of scales of measurement* by **S.S. Stevens**,
scales can be classified in four ways -- *nominal*, *ordinal*,
*interval* and *ratio*. Using current (2016) terminology, *nominal* data
is made up of unordered categories, *ordinal* data is made up of ordered
categories and the two can be classified as *discrete*. On the other hand
both *interval* and *ratio* data are *continuous*.
The scale classes below show how the rest of the Mizani package can be
used to implement the two categories of scales. The key tasks are
*training* and *mapping* and these correspond to the **train** and
**map** methods.
To train a scale on data means to make the scale learn the limits of
the data. This is elaborate (or worthy of a dedicated method) for two
reasons:
- *Practical* -- data may be split up across more than one object,
yet all will be represented by a single scale.
- *Conceptual* -- training is a key action that may need to be inserted
into multiple locations of the data processing pipeline before a
graphic can be created.
To map data onto a scale means to associate data values with
values (potential readings) on a scale. This is perhaps the most important
concept underpinning a scale.
The **apply** methods are simple examples of how to put it all together.
"""
import numpy as np
import pandas as pd
from .bounds import censor, rescale
from .utils import (
CONTINUOUS_KINDS,
DISCRETE_KINDS,
get_categories,
match,
min_max,
)
__all__ = ["scale_continuous", "scale_discrete"]
class scale_continuous:
    """
    Continuous scale
    """

    @classmethod
    def apply(cls, x, palette, na_value=None, trans=None):
        """
        Scale data continuously

        Parameters
        ----------
        x : array_like
            Continuous values to scale
        palette : callable ``f(x)``
            Palette to use
        na_value : object
            Value to use for missing values.
        trans : trans
            How to transform the data before scaling. If
            ``None``, no transformation is done.

        Returns
        -------
        out : array_like
            Scaled values
        """
        if trans is not None:
            x = trans.transform(x)

        limits = cls.train(x)
        return cls.map(x, palette, limits, na_value)

    @classmethod
    def train(cls, new_data, old=None):
        """
        Train a continuous scale

        Parameters
        ----------
        new_data : array_like
            New values
        old : array_like
            Old range. Most likely a tuple of length 2.

        Returns
        -------
        out : tuple
            Limits (range) of the scale
        """
        # Nothing to learn from empty data; keep the old range.
        if not len(new_data):
            return old

        if not hasattr(new_data, "dtype"):
            new_data = np.asarray(new_data)

        if new_data.dtype.kind not in CONTINUOUS_KINDS:
            raise TypeError("Discrete value supplied to continuous scale")

        # Fold the old range in so the result covers both, then take
        # the min & max of the finite, non-missing values.
        if old is not None:
            new_data = np.hstack([new_data, old])

        return min_max(new_data, na_rm=True, finite=True)

    @classmethod
    def map(cls, x, palette, limits, na_value=None, oob=censor):
        """
        Map values to a continuous palette

        Parameters
        ----------
        x : array_like
            Continuous values to scale
        palette : callable ``f(x)``
            Palette to use
        limits : array_like
            Limits (range) of the scale, as returned by
            :meth:`train`. Most likely a tuple of length 2.
        na_value : object
            Value to use for missing values.
        oob : callable ``f(x)``
            Function to deal with values that are
            beyond the limits

        Returns
        -------
        out : array_like
            Values mapped onto a palette
        """
        # Rescale onto [0, 1] w.r.t. the limits, then let ``oob``
        # deal with any out-of-bounds values (censored to NaN by default).
        x = oob(rescale(x, _from=limits))
        pal = palette(x)
        try:
            # Array palettes accept boolean-mask assignment
            pal[pd.isnull(x)] = na_value
        except TypeError:
            # List palettes do not; the palette propagates missing
            # inputs as null entries, which we swap for na_value.
            pal = [v if not pd.isnull(v) else na_value for v in pal]
        return pal
class scale_discrete:
    """
    Discrete scale
    """

    @classmethod
    def apply(cls, x, palette, na_value=None):
        """
        Scale data discretely

        Parameters
        ----------
        x : array_like
            Discrete values to scale
        palette : callable ``f(x)``
            Palette to use
        na_value : object
            Value to use for missing values.

        Returns
        -------
        out : array_like
            Scaled values
        """
        limits = cls.train(x)
        return cls.map(x, palette, limits, na_value)

    @classmethod
    def train(cls, new_data, old=None, drop=False, na_rm=False):
        """
        Train a discrete scale

        Parameters
        ----------
        new_data : array_like
            New values
        old : array_like
            Old range. List of values known to the scale.
        drop : bool
            Whether to drop (not include) unused categories
        na_rm : bool
            If ``True``, remove missing values. Missing values
            are either ``NaN`` or ``None``.

        Returns
        -------
        out : list
            Values covered by the scale
        """
        # Nothing to learn from empty data; keep the old range.
        if not len(new_data):
            return old

        if old is None:
            old = []
        else:
            old = list(old)

        # Get the missing values (NaN & Nones) locations and remove them
        nan_bool_idx = pd.isnull(new_data)
        has_na = np.any(nan_bool_idx)
        if not hasattr(new_data, "dtype"):
            new_data = np.asarray(new_data)
        new_data = new_data[~nan_bool_idx]

        if new_data.dtype.kind not in DISCRETE_KINDS:
            raise TypeError("Continuous value supplied to discrete scale")

        # Train i.e. get the new values
        if isinstance(new_data.dtype, pd.CategoricalDtype):
            # Categoricals carry their own (possibly ordered) categories;
            # optionally restrict to the categories actually present.
            categories = get_categories(new_data)
            if drop:
                present = set(new_data.drop_duplicates())
                new = [i for i in categories if i in present]
            else:
                new = list(categories)
        else:
            new = np.unique(new_data)
            new.sort()

        # Update old, preserving the order values were first seen in
        old_set = set(old)
        if isinstance(new_data.dtype, pd.CategoricalDtype):
            # The limits are in the order of the categories
            all_set = old_set | set(new)
            ordered_cats = categories.union(old, sort=False)
            limits = [c for c in ordered_cats if c in all_set]
        else:
            limits = old + [i for i in new if (i not in old_set)]

        # Add nan if required (only once, and only when not removed)
        has_na_limits = any(pd.isnull(limits))
        if not has_na_limits and not na_rm and has_na:
            limits.append(np.nan)
        return limits

    @classmethod
    def map(cls, x, palette, limits, na_value=None):
        """
        Map values to a discrete palette

        Parameters
        ----------
        x : array_like
            Discrete values to scale
        palette : callable ``f(n)``
            Palette to use
        limits : list
            Values covered by the scale, as returned by
            :meth:`train`.
        na_value : object
            Value to use for missing values.

        Returns
        -------
        out : array_like
            Values mapped onto a palette
        """
        # One palette entry per limit value; each x picks the entry
        # at the position of its value in the limits.
        n = len(limits)
        pal = palette(n)[match(x, limits)]
        try:
            # Array palettes accept boolean-mask assignment
            pal[pd.isnull(x)] = na_value
        except TypeError:
            # List palettes do not; replace null entries individually
            pal = [v if not pd.isnull(v) else na_value for v in pal]
        return pal