[#9] main_deploy.py: removed authors & versions. config.yaml: increased epochs with smaller lr. fetchers.py: typo fixed
Browse files
- config.yaml +2 -2
- idiomify/fetchers.py +5 -5
- idiomify/models.py +0 -1
- idiomify/paths.py +0 -1
- idiomify/preprocess.py +0 -1
- main_deploy.py +2 -6
config.yaml
CHANGED
|
@@ -3,11 +3,11 @@ idiomifier:
|
|
| 3 |
ver: m-1-3
|
| 4 |
desc: Just overfitting on PIE dataset, but now with <idiom> & </idiom> special tokens.
|
| 5 |
bart: facebook/bart-base
|
| 6 |
-
lr: 0.
|
| 7 |
literal2idiomatic_ver: d-1-3
|
| 8 |
idioms_ver: d-1-3
|
| 9 |
tokenizer_ver: t-1-1
|
| 10 |
-
max_epochs:
|
| 11 |
batch_size: 40
|
| 12 |
shuffle: true
|
| 13 |
seed: 104
|
|
|
|
| 3 |
ver: m-1-3
|
| 4 |
desc: Just overfitting on PIE dataset, but now with <idiom> & </idiom> special tokens.
|
| 5 |
bart: facebook/bart-base
|
| 6 |
+
lr: 0.00005
|
| 7 |
literal2idiomatic_ver: d-1-3
|
| 8 |
idioms_ver: d-1-3
|
| 9 |
tokenizer_ver: t-1-1
|
| 10 |
+
max_epochs: 8
|
| 11 |
batch_size: 40
|
| 12 |
shuffle: true
|
| 13 |
seed: 104
|
idiomify/fetchers.py
CHANGED
|
@@ -27,7 +27,7 @@ def fetch_idioms(ver: str, run: Run = None) -> pd.DataFrame:
|
|
| 27 |
artifact = run.use_artifact(f"idioms:{ver}", type="dataset")
|
| 28 |
else:
|
| 29 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
|
| 30 |
-
artifact_dir = artifact.download(root=idioms_dir(ver))
|
| 31 |
tsv_path = path.join(artifact_dir, "all.tsv")
|
| 32 |
return pd.read_csv(tsv_path, sep="\t")
|
| 33 |
|
|
@@ -39,7 +39,7 @@ def fetch_literal2idiomatic(ver: str, run: Run = None) -> Tuple[pd.DataFrame, pd
|
|
| 39 |
artifact = run.use_artifact(f"literal2idiomatic:{ver}", type="dataset")
|
| 40 |
else:
|
| 41 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/literal2idiomatic:{ver}", type="dataset")
|
| 42 |
-
artifact_dir = artifact.download(root=literal2idiomatic(ver))
|
| 43 |
train_path = path.join(artifact_dir, "train.tsv")
|
| 44 |
test_path = path.join(artifact_dir, "test.tsv")
|
| 45 |
train_df = pd.read_csv(train_path, sep="\t")
|
|
@@ -57,10 +57,10 @@ def fetch_idiomifier(ver: str, run: Run = None) -> Idiomifier:
|
|
| 57 |
else:
|
| 58 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/idiomifier:{ver}", type="model")
|
| 59 |
config = artifact.metadata
|
| 60 |
-
artifact_dir = artifact.download(root=idiomifier_dir(ver))
|
| 61 |
ckpt_path = path.join(artifact_dir, "model.ckpt")
|
| 62 |
bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
|
| 63 |
-
bart.
|
| 64 |
model = Idiomifier.load_from_checkpoint(ckpt_path, bart=bart)
|
| 65 |
return model
|
| 66 |
|
|
@@ -70,7 +70,7 @@ def fetch_tokenizer(ver: str, run: Run = None) -> BartTokenizer:
|
|
| 70 |
artifact = run.use_artifact(f"tokenizer:{ver}", type="other")
|
| 71 |
else:
|
| 72 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/tokenizer:{ver}", type="other")
|
| 73 |
-
artifact_dir = artifact.download(root=tokenizer_dir(ver))
|
| 74 |
tokenizer = BartTokenizer.from_pretrained(artifact_dir)
|
| 75 |
return tokenizer
|
| 76 |
|
|
|
|
| 27 |
artifact = run.use_artifact(f"idioms:{ver}", type="dataset")
|
| 28 |
else:
|
| 29 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
|
| 30 |
+
artifact_dir = artifact.download(root=str(idioms_dir(ver)))
|
| 31 |
tsv_path = path.join(artifact_dir, "all.tsv")
|
| 32 |
return pd.read_csv(tsv_path, sep="\t")
|
| 33 |
|
|
|
|
| 39 |
artifact = run.use_artifact(f"literal2idiomatic:{ver}", type="dataset")
|
| 40 |
else:
|
| 41 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/literal2idiomatic:{ver}", type="dataset")
|
| 42 |
+
artifact_dir = artifact.download(root=str(literal2idiomatic(ver)))
|
| 43 |
train_path = path.join(artifact_dir, "train.tsv")
|
| 44 |
test_path = path.join(artifact_dir, "test.tsv")
|
| 45 |
train_df = pd.read_csv(train_path, sep="\t")
|
|
|
|
| 57 |
else:
|
| 58 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/idiomifier:{ver}", type="model")
|
| 59 |
config = artifact.metadata
|
| 60 |
+
artifact_dir = artifact.download(root=str(idiomifier_dir(ver)))
|
| 61 |
ckpt_path = path.join(artifact_dir, "model.ckpt")
|
| 62 |
bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
|
| 63 |
+
bart.resize_token_embeddings(config['vocab_size'])
|
| 64 |
model = Idiomifier.load_from_checkpoint(ckpt_path, bart=bart)
|
| 65 |
return model
|
| 66 |
|
|
|
|
| 70 |
artifact = run.use_artifact(f"tokenizer:{ver}", type="other")
|
| 71 |
else:
|
| 72 |
artifact = wandb.Api().artifact(f"eubinecto/idiomify/tokenizer:{ver}", type="other")
|
| 73 |
+
artifact_dir = artifact.download(root=str(tokenizer_dir(ver)))
|
| 74 |
tokenizer = BartTokenizer.from_pretrained(artifact_dir)
|
| 75 |
return tokenizer
|
| 76 |
|
idiomify/models.py
CHANGED
|
@@ -71,4 +71,3 @@ class Idiomifier(pl.LightningModule): # noqa
|
|
| 71 |
"""
|
| 72 |
# The authors used Adam, so we might as well use it as well.
|
| 73 |
return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
|
| 74 |
-
|
|
|
|
| 71 |
"""
|
| 72 |
# The authors used Adam, so we might as well use it as well.
|
| 73 |
return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
|
|
|
idiomify/paths.py
CHANGED
|
@@ -19,4 +19,3 @@ def idiomifier_dir(ver: str) -> Path:
|
|
| 19 |
|
| 20 |
def tokenizer_dir(ver: str) -> Path:
|
| 21 |
return ARTIFACTS_DIR / f"tokenizer_{ver}"
|
| 22 |
-
|
|
|
|
| 19 |
|
| 20 |
def tokenizer_dir(ver: str) -> Path:
|
| 21 |
return ARTIFACTS_DIR / f"tokenizer_{ver}"
|
|
|
idiomify/preprocess.py
CHANGED
|
@@ -59,4 +59,3 @@ def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.Data
|
|
| 59 |
test_size=other_size, random_state=seed,
|
| 60 |
shuffle=True)
|
| 61 |
return ratio_df, other_df
|
| 62 |
-
|
|
|
|
| 59 |
test_size=other_size, random_state=seed,
|
| 60 |
shuffle=True)
|
| 61 |
return ratio_df, other_df
|
|
|
main_deploy.py
CHANGED
|
@@ -1,20 +1,18 @@
|
|
| 1 |
"""
|
| 2 |
we deploy the pipeline via streamlit.
|
| 3 |
"""
|
| 4 |
-
from typing import Tuple, List
|
| 5 |
import streamlit as st
|
| 6 |
from transformers import BartTokenizer
|
| 7 |
from idiomify.fetchers import fetch_config, fetch_idiomifier, fetch_idioms
|
| 8 |
from idiomify.pipeline import Pipeline
|
| 9 |
-
from idiomify.models import Idiomifier
|
| 10 |
|
| 11 |
|
| 12 |
@st.cache(allow_output_mutation=True)
|
| 13 |
-
def fetch_resources() ->
|
| 14 |
config = fetch_config()['idiomifier']
|
| 15 |
model = fetch_idiomifier(config['ver'])
|
| 16 |
-
idioms = fetch_idioms(config['idioms_ver'])
|
| 17 |
tokenizer = BartTokenizer.from_pretrained(config['bart'])
|
|
|
|
| 18 |
return config, model, tokenizer, idioms
|
| 19 |
|
| 20 |
|
|
@@ -24,8 +22,6 @@ def main():
|
|
| 24 |
model.eval()
|
| 25 |
pipeline = Pipeline(model, tokenizer)
|
| 26 |
st.title("Idiomify Demo")
|
| 27 |
-
st.markdown(f"Author: `Eu-Bin KIM`")
|
| 28 |
-
st.markdown(f"Version: `{config['ver']}`")
|
| 29 |
text = st.text_area("Type sentences here",
|
| 30 |
value="Just remember there will always be a hope even when things look black")
|
| 31 |
with st.sidebar:
|
|
|
|
| 1 |
"""
|
| 2 |
we deploy the pipeline via streamlit.
|
| 3 |
"""
|
|
|
|
| 4 |
import streamlit as st
|
| 5 |
from transformers import BartTokenizer
|
| 6 |
from idiomify.fetchers import fetch_config, fetch_idiomifier, fetch_idioms
|
| 7 |
from idiomify.pipeline import Pipeline
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
@st.cache(allow_output_mutation=True)
|
| 11 |
+
def fetch_resources() -> tuple:
|
| 12 |
config = fetch_config()['idiomifier']
|
| 13 |
model = fetch_idiomifier(config['ver'])
|
|
|
|
| 14 |
tokenizer = BartTokenizer.from_pretrained(config['bart'])
|
| 15 |
+
idioms = fetch_idioms(config['idioms_ver'])
|
| 16 |
return config, model, tokenizer, idioms
|
| 17 |
|
| 18 |
|
|
|
|
| 22 |
model.eval()
|
| 23 |
pipeline = Pipeline(model, tokenizer)
|
| 24 |
st.title("Idiomify Demo")
|
|
|
|
|
|
|
| 25 |
text = st.text_area("Type sentences here",
|
| 26 |
value="Just remember there will always be a hope even when things look black")
|
| 27 |
with st.sidebar:
|