Spaces:
Sleeping
Sleeping
Commit
·
b2c2c22
1
Parent(s):
b1bc515
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,12 +1,23 @@
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
import pandas as pd
|
| 3 |
import streamlit as st
|
|
|
|
|
|
|
| 4 |
|
| 5 |
st.set_page_config(layout="wide")
|
| 6 |
|
| 7 |
with st.sidebar:
|
| 8 |
subset = st.selectbox('Flores eng_Latn-ukr_Cyrl subset', ('dev', 'devtest'))
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
st.dataframe(pd.DataFrame(dataset[subset]))
|
| 12 |
|
|
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
import pandas as pd
|
| 3 |
import streamlit as st
|
| 4 |
+
from transformers import AutoTokenizer
|
| 5 |
+
import matplotlib.pyplot as plt
|
| 6 |
|
| 7 |
st.set_page_config(layout="wide")
|
| 8 |
|
| 9 |
with st.sidebar:
|
| 10 |
subset = st.selectbox('Flores eng_Latn-ukr_Cyrl subset', ('dev', 'devtest'))
|
| 11 |
|
| 12 |
+
# Only `AutoTokenizer` is imported (from transformers import AutoTokenizer),
# so it must be referenced directly; the bare `transformers` name is unbound.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
| 13 |
+
flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
|
| 14 |
+
dataset = flores[subset]
|
| 15 |
+
|
| 16 |
+
fig, (axl, axr) = plt.subplots(1, 2, figsize=(10,5))
|
| 17 |
+
# Left panel: histogram of per-sentence token counts for the English side.
# Uses the `dataset` split and `tokenizer` bound earlier in the script.
axl.hist(dataset.map(lambda x: {'num_tokens': len(tokenizer(x['sentence_eng_Latn'])['input_ids'])})['num_tokens'])
|
| 18 |
+
axl.set_title('eng mistral tokens')
|
| 19 |
+
# Right panel: histogram of per-sentence token counts for the Ukrainian side.
# Uses the `dataset` split and `tokenizer` bound earlier in the script.
axr.hist(dataset.map(lambda x: {'num_tokens': len(tokenizer(x['sentence_ukr_Cyrl'])['input_ids'])})['num_tokens'])
|
| 20 |
+
axr.set_title('ukr mistral tokens')
# Streamlit does not display matplotlib figures implicitly; hand the figure
# over explicitly so both histograms actually show up on the page.
st.pyplot(fig)
|
| 21 |
+
|
| 22 |
# `dataset` is already the selected split (flores[subset]); indexing it again
# with the split name would be treated as a column lookup and fail.
st.dataframe(pd.DataFrame(dataset))
|
| 23 |
|