# Author: joaogante (HF staff)
# Commit a39a042: rearrange layout for blog post
import matplotlib
matplotlib.use('Agg')
import functools
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Path of the PNG that get_plot writes and returns; every render reuses it.
FIGURE_PATH: str = "plt.png"
# Resolution (dots per inch) handed to savefig when the PNG is written.
FIG_DPI: int = 300
def get_plot(task, gpu, omit_offload):
    """Render the greedy-vs-assisted latency bar chart for one benchmark slice.

    Reads ``data.csv``, keeps the rows whose ``task`` and ``gpu`` columns match
    the arguments, and draws one pair of bars (``Greedy`` / ``Assisted``) per
    model-and-dtype combination. The chart is written to ``FIGURE_PATH``.

    Args:
        task: Value to match against the ``task`` column.
        gpu: Value to match against the ``gpu`` column.
        omit_offload: ``"Yes"`` keeps only rows whose ``offload`` column is 0;
            any other value keeps every row.

    Returns:
        ``FIGURE_PATH``, the path of the PNG that was just written.
    """
    label_size = 7

    # Slice the dataframe according to the inputs.
    df = pd.read_csv("data.csv")
    keep = (df["task"] == task) & (df["gpu"] == gpu)
    if omit_offload == "Yes":
        keep &= df["offload"] == 0
    # Copy the selection so the column added below lands on an independent
    # frame instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df[keep].copy()

    # Combine model name and dtype into a single x-axis label.
    df["model and dtype"] = df["model_name"].str.cat(df["dtype"], sep=", ")

    # Fuse the two columns to be compared (original and assisted generation)
    # into long form: one row per (model, generation type).
    df = df.melt(
        id_vars=["task", "gpu", "model and dtype", "offload"],
        value_vars=["Greedy", "Assisted"],
        var_name="generation_type",
        value_name="generation_time",
    )

    g = sns.catplot(
        data=df,
        kind="bar",
        x="model and dtype",
        y="generation_time",
        hue="generation_type",
        palette={"Greedy": "blue", "Assisted": "orange"},
        alpha=.9,
    )
    try:
        g.despine(left=True)
        g.set_axis_labels("Model size and dtype", "Latency (ms/token)")
        g.set_xticklabels(fontsize=label_size)
        g.set_yticklabels(fontsize=label_size)

        # Use the public legend accessor for both the title and the entries.
        legend = g.legend
        legend.set_title("Generation Type")
        plt.setp(legend.get_texts(), fontsize=label_size)

        # Add the number to the top of each bar.
        ax = g.facet_axis(0, 0)
        for container in ax.containers:
            ax.bar_label(container, fontsize=label_size)

        # Save this grid's own figure rather than whatever pyplot currently
        # considers the "current" one.
        g.figure.savefig(FIGURE_PATH, dpi=FIG_DPI)
    finally:
        # Every call creates a new figure; release it so a long-running app
        # does not accumulate open figures.
        plt.close(g.figure)
    return FIGURE_PATH
# Component factories shared across tabs. Each call builds a fresh widget, so
# they must be invoked while the target layout context is active.
omit_offload_fn = functools.partial(
    gr.Radio, ["Yes", "No"], value="No", label="Omit cases with memory offload?", interactive=True
)


def gpu_selector_fn(gpu_list):
    """Return a GPU dropdown over ``gpu_list`` that starts on its last entry."""
    return gr.Dropdown(
        gpu_list, value=gpu_list[-1], label="GPU", interactive=True
    )


def _benchmark_notes(assistant, models, dataset):
    """Build the markdown notes shown under a plot.

    Args:
        assistant: Checkpoint name of the assistant model.
        models: ``(short label, checkpoint name)`` pairs, in display order.
        dataset: Description of the dataset used as input prompt.

    Returns:
        The markdown text, starting and ending with a newline.
    """
    lines = ["", "### Assistant Model", f"- `{assistant}`", "### Model Names:"]
    lines += [f"- {label}: `{checkpoint}`" for label, checkpoint in models]
    lines += ["### Dataset used as input prompt:", f"- {dataset}", ""]
    return "\n".join(lines)


def _add_benchmark_tab(label, task, gpu_list, notes):
    """Add one benchmark tab: selectors, plot, notes, and change handlers.

    Must be called while a ``gr.Tabs()`` context is active.

    Args:
        label: Text shown on the tab.
        task: Task name bound as the first argument of ``get_plot``.
        gpu_list: GPU names offered in the dropdown; the last one is the default.
        notes: Markdown text rendered under the plot.
    """
    plot_fn = functools.partial(get_plot, task)
    # NOTE(review): the nesting below (selectors side by side in a row, plot
    # and notes underneath) is reconstructed -- confirm against the deployed
    # layout.
    with gr.TabItem(label):
        with gr.Row():
            with gr.Column():
                # Hand each dropdown its own list object.
                gpu_selector = gpu_selector_fn(list(gpu_list))
            with gr.Column():
                omit_offload = omit_offload_fn()
        # Show plot when the gradio app is initialized, using the same
        # defaults the two selectors start with.
        plot = gr.Image(value=plot_fn(gpu_list[-1], "No"))
        gr.Markdown(notes)
        # Update plot when any of the inputs change.
        plot_inputs = [gpu_selector, omit_offload]
        for control in plot_inputs:
            control.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)


_ALL_GPUS = ("3090", "T4", "T4 *2", "A100 (80GB)")
_OPT_MODELS = (
    ("1.3B", "facebook/opt-1.3b"),
    ("6.7B", "facebook/opt-6.7b"),
    ("30B", "facebook/opt-30b"),
    ("66B", "facebook/opt-66b"),
)

demo = gr.Blocks()

with demo:
    gr.Markdown("\n# Assisted Generation Benchmark\n")

    with gr.Tabs():
        _add_benchmark_tab(
            "OPT: Open",
            task="OPT: Open Text Generation",
            gpu_list=_ALL_GPUS,
            notes=_benchmark_notes(
                "facebook/opt-125m", _OPT_MODELS, "C4 (en, validation set)"
            ),
        )
        _add_benchmark_tab(
            "OPT: Summ",
            task="OPT: Summarization",
            gpu_list=_ALL_GPUS,
            notes=_benchmark_notes(
                "facebook/opt-125m", _OPT_MODELS, "CNN Dailymail (3.0.0, validation set)"
            ),
        )
        _add_benchmark_tab(
            "Whisper: ARS",
            task="Whisper: ARS",
            gpu_list=("3090", "T4"),
            notes=_benchmark_notes(
                "openai/whisper-tiny",
                [("large-v2", "openai/whisper-large-v2")],
                "Librispeech ARS (clean, validation set)",
            ),
        )
        _add_benchmark_tab(
            "CodeGen: Code",
            task="CodeGen: Code Generation",
            gpu_list=_ALL_GPUS,
            notes=_benchmark_notes(
                "Salesforce/codegen-350M-mono",
                [
                    ("2B", "Salesforce/codegen-2B-mono"),
                    ("6B", "Salesforce/codegen-6B-mono"),
                    ("16B", "Salesforce/codegen-16B-mono"),
                ],
                "The Stack (python)",
            ),
        )
        _add_benchmark_tab(
            "Flan-T5: Summ",
            task="Flan-T5: Summarization",
            gpu_list=_ALL_GPUS,
            notes=_benchmark_notes(
                "google/flan-t5-small",
                [
                    ("large", "google/flan-t5-large"),
                    ("xl", "google/flan-t5-xl"),
                    ("xxl", "google/flan-t5-xxl"),
                    ("ul2", "google/flan-ul2"),
                ],
                "CNN Dailymail (3.0.0, validation set)",
            ),
        )

        # Static table describing the benchmark environment.
        with gr.TabItem("Benchmark Info"):
            gr.Dataframe(
                headers=["Parameter", "Value"],
                value=[
                    ["Transformers Version", "4.29dev0"],
                    ["Pytorch Version", "2.0.0"],
                    ["OS", "22.04 LTS (3090) / Debian 10 (other GPUs)"],
                    ["CUDA", "11.8 (3090) / 11.3 (others GPUs)"],
                    ["Number of input samples", "20-100 (depending on the model size)"],
                    ["Is there code to reproduce?", "Yes -- https://github.com/gante/huggingface-demos/tree/main/experiments/faster_generation"],
                ],
            )

demo.launch()