Histograms#

The following examples use the dummy data which is described here

b-tagging discriminant plot#

"""Produce histogram of discriminant from tagger output and labels."""

from __future__ import annotations

import numpy as np
from ftag import Flavours
from ftag.utils import get_discriminant

from puma import Histogram, HistogramPlot
from puma.utils import get_dummy_2_taggers, get_good_linestyles

# Dummy tagger outputs that resemble the output of a neural network
df = get_dummy_2_taggers()


def _b_discriminant(tagger_name: str):
    """Return the b-jet discriminant of ``tagger_name`` for the jets in ``df``.

    Both taggers are evaluated with the same signal flavour, flavour category
    and fraction values, so that configuration is written down only once here.
    """
    return get_discriminant(
        jets=df,
        tagger=tagger_name,
        signal=Flavours["bjets"],
        flavours=Flavours.by_category("single-btag"),
        fraction_values={"fc": 0.018, "fu": 0.982, "ftau": 0},
    )


# Discriminant scores for DIPS and RNNIP
disc_dips = _b_discriminant("dips")
disc_rnnip = _b_discriminant("rnnip")

# Boolean masks selecting the light-, c- and b-jets via their truth label
truth_label = df["HadronConeExclTruthLabelID"]
is_light = truth_label == 0
is_c = truth_label == 4
is_b = truth_label == 5

# Taggers to compare, their discriminants, and one linestyle per tagger
taggers = ["dips", "rnnip"]
discs = {"dips": disc_dips, "rnnip": disc_rnnip}
linestyles = get_good_linestyles()[:2]

# Set up the canvas: one main panel plus a single ratio panel relative to DIPS
plot_histo = HistogramPlot(
    n_ratio_panels=1,
    ylabel="Normalised number of jets",
    ylabel_ratio=["Ratio to DIPS"],
    xlabel="$b$-jet discriminant",
    logy=False,
    leg_ncol=1,
    figsize=(5.5, 4.5),
    y_scale=1.5,
    ymax_ratio=[1.5],
    ymin_ratio=[0.5],
    atlas_second_tag="$\\sqrt{s}=13$ TeV, dummy jets \ndummy sample, $f_{c}=0.018$",
)

# One entry per flavour class: (selection mask, legend label, flavour name).
# The flavour name is used both to look up the colour and as the ratio group.
flavour_specs = (
    (is_light, "Light-flavour jets", "ujets"),
    (is_c, "$c$-jets", "cjets"),
    (is_b, "$b$-jets", "bjets"),
)

# Add one histogram per tagger and flavour
for tagger, linestyle in zip(taggers, linestyles):
    # DIPS is the reference of every ratio group
    is_dips = tagger == "dips"
    for mask, legend_label, flavour_name in flavour_specs:
        plot_histo.add(
            Histogram(
                values=discs[tagger][mask],
                bins=np.linspace(-10, 10, 50),
                # Only the DIPS histograms get a legend label: the RNNIP ones
                # share the same line colour, so their entries are hidden and
                # the taggers are told apart by the "linestyle legend" that is
                # created further down in the script
                label=legend_label if is_dips else None,
                colour=Flavours[flavour_name].colour,
                ratio_group=flavour_name,
                linestyle=linestyle,
            ),
            reference=is_dips,
        )

plot_histo.draw()
# Create the legend that maps linestyles to taggers (first linestyle -> DIPS,
# second linestyle -> RNNIP). "bbox_to_anchor" specifies where this linestyle
# legend is placed
plot_histo.make_linestyle_legend(
    linestyles=linestyles,
    labels=["DIPS", "RNNIP"],
    bbox_to_anchor=(0.55, 1),
)
plot_histo.savefig("histogram_discriminant.png", transparent=False)

Flavour probabilities plot#

"""Example plot script for flavour probability comparison."""

from __future__ import annotations

import numpy as np

from puma import Histogram, HistogramPlot
from puma.utils import get_dummy_2_taggers

# Dummy tagger outputs that resemble the output of a neural network
df = get_dummy_2_taggers()

# Set up the canvas: no ratio panel, logarithmic y-axis, no error bands
plot_histo = HistogramPlot(
    n_ratio_panels=0,
    ylabel="Normalised number of jets",
    xlabel="$b$-jets probability",
    logy=True,
    leg_ncol=1,
    atlas_first_tag="Simulation, $\\sqrt{s}=13$ TeV",
    atlas_second_tag="dummy sample, dummy jets",
    atlas_brand=None,  # You can deactivate the ATLAS branding (e.g. for a thesis)
    draw_errors=False,
)

# Split the dummy jets by truth label into light-, c- and b-jets
truth_label = df["HadronConeExclTruthLabelID"]
u_jets = df[truth_label == 0]
c_jets = df[truth_label == 4]
b_jets = df[truth_label == 5]

# Each entry: (jets, flavour, extra Histogram options). Passing "flavour" adds a
# flavour prefix such as "light-flavour jets" to the label and selects the
# colour associated with that flavour. The b-jets keep the default linestyle.
histogram_specs = (
    (u_jets, "ujets", {"linestyle": "dashed"}),
    (c_jets, "cjets", {"linestyle": "dashdot"}),
    (b_jets, "bjets", {}),
)

# Add the DIPS b-jet probability distribution of every flavour
for jets, flavour_name, extra_options in histogram_specs:
    plot_histo.add(
        Histogram(
            jets["dips_pb"],
            bins=np.linspace(0, 1, 30),
            flavour=flavour_name,
            **extra_options,
        )
    )

plot_histo.draw()
plot_histo.savefig("histogram_bjets_probability.png", transparent=False)

More general example#

In most cases, you will probably want to plot histograms of the different flavours, as in the examples above. However, the puma API allows you to plot any kind of data. For example, you could also produce an MC vs. data plot with the following code:

"""Example of histogram plot that deviates from puma default plots."""

from __future__ import annotations

import numpy as np

from puma import Histogram, HistogramPlot

# Sample sizes of the background and of the additional signal peak
N_BKG = int(1e6)
N_SIG = int(2e4)
rng = np.random.default_rng(seed=42)

# "Expectation": pure exponential background.
# "Measurement": an independent exponential background plus a narrow Gaussian
# peak around 2.
expectation = rng.exponential(size=N_BKG)
measured_background = rng.exponential(size=N_BKG)
measured_peak = rng.normal(loc=2, scale=0.2, size=N_SIG)
measurement = np.concatenate((measured_background, measured_peak))

# Both histograms use the same binning and show raw (unnormalised) counts
common_options = {"bins": 50, "bins_range": (1.1, 4), "norm": False}

expectation_hist = Histogram(
    expectation,
    label="MC",
    histtype="stepfilled",
    alpha=1,
    **common_options,
)
measurement_hist = Histogram(
    measurement,
    label="dummy data",
    **common_options,
)

# Set up the canvas with one ratio panel
plot_histo = HistogramPlot(
    n_ratio_panels=1,
    figsize=(6, 5),
    logy=False,
    xlabel="Invariant mass $m$ [a.u.]",
    ylabel="Number of events",
    atlas_first_tag="Simulation Internal",
    atlas_second_tag="Example for more general plot",
)

# The expectation is the reference for the ratio panel
plot_histo.add(expectation_hist, reference=True)
plot_histo.add(measurement_hist)

plot_histo.draw()
plot_histo.savefig("histogram_basic_example.png", transparent=False)

Weighted histograms#

puma also supports weighted histograms by specifying the optional argument weights. An example is given below:

"""Example script for plotting weighted histograms."""

from __future__ import annotations

import numpy as np

from puma import Histogram, HistogramPlot

rng = np.random.default_rng(seed=42)

# Two Gaussian peaks with 10_000 entries each: one located at 0, one at 3
left_peak = rng.normal(size=10_000)
right_peak = rng.normal(loc=3, size=10_000)
values = np.hstack((left_peak, right_peak))

# Per-entry weights: 1 for the left peak, 2 for the right peak
left_weights = np.ones(10_000)
right_weights = 2 * np.ones(10_000)
weights = np.hstack((left_weights, right_weights))

# Binning shared by both histograms; raw counts so the weights stay visible
shared_options = {"bins": 40, "bins_range": (-3, 6), "norm": False}

hist_plot = HistogramPlot(n_ratio_panels=1)

# Unweighted histogram, used as the reference of the ratio panel
unweighted_hist = Histogram(
    values=values,
    label="Without weights",
    **shared_options,
)
hist_plot.add(unweighted_hist, reference=True)

# Same values, but with the weights applied
weighted_hist = Histogram(
    values=values,
    weights=weights,
    label="Weight 2 for right peak",
    **shared_options,
)
hist_plot.add(weighted_hist)

hist_plot.draw()
hist_plot.savefig("histogram_weighted.png")

Underflow/overflow bins#

Underflow and overflow bins are enabled by default, but can be deactivated using the underoverflow argument of puma.Histogram. Below is an example of the same Gaussian distribution plotted without and with underflow/overflow bins.

"""Example script that demonstrates under/overflow bins."""

from __future__ import annotations

import numpy as np

from puma import Histogram, HistogramPlot

rng = np.random.default_rng(42)

# Standard-normal sample; the histogram range below only covers (-2, 2), so
# part of the sample lies outside the binned range
vals = rng.normal(size=10_000)


def _plot_gaussian(title: str, output_file: str, **histogram_options):
    """Draw ``vals`` in a fresh plot with the given title and save it.

    ``histogram_options`` are forwarded to ``Histogram`` on top of the common
    binning and label, e.g. to switch off the underflow/overflow bins.
    """
    plot = HistogramPlot()
    plot.title = title
    plot.add(
        Histogram(
            values=vals,
            bins=40,
            bins_range=(-2, 2),
            label="Gaussian($\\mu=0$, $\\sigma=1$)",
            **histogram_options,
        )
    )
    plot.draw()
    plot.savefig(output_file)
    return plot


# Underflow/overflow bins explicitly switched off
plot_without = _plot_gaussian(
    "Without underflow/overflow bins",
    "hist_without_underoverflow.png",
    underoverflow=False,
)

# Default behaviour of Histogram (no "underoverflow" argument passed)
plot_with = _plot_gaussian(
    "With underflow/overflow bins",
    "hist_with_underoverflow.png",
)

Data/MC histograms#

To visualize the agreement of the Monte-Carlo with data, puma is also able to produce so-called Data/MC histograms. They show the data as a dot histogram while the MC is still a stacked histogram. An example of this plot can be seen here:

The code to create this example can be found in the examples folder in plot_data_mc.py. As with the other puma.HistogramPlot examples shown here, many more optional arguments can be passed.

"""Example script that demonstrates Data/MC plots."""

from __future__ import annotations

import numpy as np

from puma import Histogram, HistogramPlot

# Toy inputs: two MC contributions and a "data" sample, all standard normal
rng = np.random.default_rng(42)
mc1 = rng.normal(size=10_000)
mc2 = rng.normal(size=20_000)
data = rng.normal(size=30_000)

# Canvas with stacked histograms and one ratio panel
data_mc_plot = HistogramPlot(n_ratio_panels=1, stacked=True)
data_mc_plot.title = "Test Data/MC Plot"

# Add the MC contributions (unnormalised)
mc_contributions = ((mc1, "MC Process 1"), (mc2, "MC Process 2"))
for mc_values, mc_label in mc_contributions:
    data_mc_plot.add(
        Histogram(
            mc_values,
            bins=40,
            bins_range=[-2, 2],
            label=mc_label,
            norm=False,
        )
    )

# Add the data: flagged with "is_data" and drawn in black
data_hist = Histogram(
    data,
    bins=40,
    bins_range=[-2, 2],
    label="Data",
    is_data=True,
    colour="k",
    norm=False,
)
data_mc_plot.add(data_hist)

# Draw first, then append the bin width to the y-axis label and save
data_mc_plot.draw()
data_mc_plot.add_bin_width_to_ylabel()
data_mc_plot.savefig("data_mc_example.png")