yenniejun commited on
Commit
40b05d1
1 Parent(s): 111053f

Add bar chart showing languages with the shortest/longest median tokenized text

Browse files
Files changed (1) hide show
  1. app.py +32 -2
app.py CHANGED
@@ -9,6 +9,8 @@ import seaborn as sns
9
  import numpy as np
10
  import plotly.figure_factory as ff
11
  import plotly.express as px
 
 
12
  import random, glob
13
 
14
  @st.cache_data
@@ -51,7 +53,9 @@ tokenizer_names_to_test = [
51
  with st.sidebar:
52
 
53
  st.header('All languages are NOT created (tokenized) equal!')
54
- link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
 
 
55
  st.markdown(link)
56
 
57
  st.header('Data Visualization')
@@ -130,7 +134,33 @@ with st.container():
130
  )
131
  st.plotly_chart(fig, use_container_width=True)
132
 
133
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
 
136
 
 
9
  import numpy as np
10
  import plotly.figure_factory as ff
11
  import plotly.express as px
12
+ from plotly.subplots import make_subplots
13
+ import plotly.graph_objects as go
14
  import random, glob
15
 
16
  @st.cache_data
 
53
  with st.sidebar:
54
 
55
  st.header('All languages are NOT created (tokenized) equal!')
56
+ link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese)."
57
+ st.markdown(link)
58
+ link="This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
59
  st.markdown(link)
60
 
61
  st.header('Data Visualization')
 
134
  )
135
  st.plotly_chart(fig, use_container_width=True)
136
 
137
+
138
+ # Create figures using px.bar
139
+ shortest = val_data.groupby('lang')[tokenizer_name].median().sort_values().head(7).reset_index()
140
+ shortest["type"] = "shortest"
141
+ longest = val_data.groupby('lang')[tokenizer_name].median().sort_values().tail(7).reset_index()
142
+ longest["type"] = "longest"
143
+ combined = pd.concat([shortest, longest]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)
144
+ color_sequence = px.colors.qualitative.D3 # You can choose other built-in sequences or define your own
145
+ fig = px.bar(combined, x=tokenizer_name, y="lang", orientation='h', color='type', color_discrete_sequence=color_sequence)
146
+ fig.update_traces(hovertemplate='%{y}: %{x} tokens')
147
+ fig.update_layout(
148
+ title=dict(text='Top Langs with Shortest and Longest Median Token Lengths',
149
+ font=dict(size=25), automargin=True, yref='paper', pad=dict(b=20)), # Add more padding below the title
150
+ # title='Distribution of tokens',
151
+ xaxis=dict(
152
+ title="Number of Tokens",
153
+ showgrid=True, # Show vertical gridlines
154
+ gridwidth=1, # Gridline width
155
+ gridcolor='LightGrey' # Gridline color
156
+ ),
157
+ yaxis=dict(
158
+ title="",
159
+ ),
160
+ height=400,
161
+ showlegend=False # Remove the legend
162
+ )
163
+ st.plotly_chart(fig, use_container_width=True)
164
 
165
 
166