import altair as alt
import pandas as pd
[docs]def categorical_eda(
data,
xval,
plot_type="histogram",
color=None,
title=None,
font_size=10,
color_scheme="tableau20",
plot_height=150,
plot_width=200,
opacity=1,
facet_factor=None,
facet_col=None,
):
"""
This function takes in a data frame object and one categorical
feature, to produce a histogram plot that visualizes the
distribution of the feature. User can also choose to plot density
graph of the feature by specifing in plot_type.
The function also offers customization on color, plot title,
font size, color-scheme, plot size and other common configurations.
Parameters
----------
data : pandas.core.frame.DataFrame
Input dataframe object.
xval : str
Variable used to represent the x-axis.
plot_type : str, optional
Variable used to specify plot type. Options include "histogram"
and "density". When "density" is selected, the variable yval becomes
obsolete.
color : str, optional
Variable used to set the color of the marks in the plot object.
tilte : str, optional
Variable used to set the title of the plot.
font_size : int, optional
Variable used to set the size of the axis labels and title.
color_scheme : str, optional
Variable used to set the bar size.
plot_height : int, optional
Variable used to specify plot height
plot_witdh : int, optional
Variable used to specify plot width
opacity : float, optional
Variable used to specify density fill opacity for density plot
facet_factor : str, optional
Variable used to specify facet factor
facet_col : int, optional
Variable used to specify number of facet columns
Returns
-------
`altair`
A histogram or density chart object based on user specifications.
Examples
--------
>>> import altair as alt
>>> import numpy as np
>>> import pandas as pd
>>> from simpler_eda.categorical_eda import categorical_plot
>>> from vega_datasets import data
>>> cars = data.cars()
>>> categorical_eda(data = cars,
xval = "Origin",
color = "Horsepower",
title = "Histogram of Origin in Different Levels of
Horsepower",
plot_height = 100,
plot_width = 200
)
"""
# Checking for valid inputs:
if not isinstance(data, pd.DataFrame):
raise Exception("the input data has to be a dataframe.")
if facet_factor is None and facet_col is not None:
raise Exception("facet_factor must be provided along with facet_col.")
if facet_factor is not None and facet_col is None:
raise Exception("Specify facet_col for facetting the plot")
if plot_type not in ["histogram", "density"]:
raise Exception("plot_type must be either 'histogram' or 'density'")
if opacity <= 0 or opacity > 1:
raise Exception("opacity must be in range (0, 1)")
if xval not in data.columns:
raise Exception("xval must be a feature in the input dataframe")
if color is not None and color not in data.columns:
raise Exception("color must be a feature in the input dataframe")
if facet_factor is None:
if plot_type == "histogram":
categorical_plot = (
alt.Chart(data=data, title=title)
.mark_bar()
.encode(
x=alt.X(xval),
y="count()",
color=alt.Color(
color, scale=alt.Scale(scheme=color_scheme)
),
)
.properties(width=plot_width, height=plot_height)
.configure_title(fontSize=font_size)
.configure_axis(
labelFontSize=font_size, titleFontSize=font_size
)
)
else:
categorical_plot = (
alt.Chart(data=data, title=title)
.transform_density(
xval, groupby=[color], as_=[xval, "density"]
)
.mark_area(opacity=opacity)
.encode(
x=xval,
y="density:Q",
color=alt.Color(
color, scale=alt.Scale(scheme=color_scheme)
),
)
.properties(width=plot_width, height=plot_height)
.configure_title(fontSize=font_size)
.configure_axis(
labelFontSize=font_size, titleFontSize=font_size
)
)
else:
if plot_type == "histogram":
categorical_plot = (
alt.Chart(data=data)
.mark_bar()
.encode(
x=alt.X(xval),
y="count()",
color=alt.Color(
color, scale=alt.Scale(scheme=color_scheme)
),
)
.properties(width=plot_width, height=plot_height)
.facet(facet_factor, columns=facet_col, title=title)
.configure_title(fontSize=font_size)
.configure_axis(
labelFontSize=font_size, titleFontSize=font_size
)
)
else:
categorical_plot = (
alt.Chart(data=data)
.transform_density(
xval, groupby=[color], as_=[xval, "density"]
)
.mark_area(opacity=opacity)
.encode(
x=xval,
y="density:Q",
color=alt.Color(
color, scale=alt.Scale(scheme=color_scheme)
),
)
.properties(width=plot_width, height=plot_height)
.facet(facet_factor, columns=facet_col, title=title)
.configure_title(fontSize=font_size)
.configure_axis(
labelFontSize=font_size, titleFontSize=font_size
)
)
return categorical_plot