[#5] literal2idiomatic:d-1-3 done (annotating with special tokens). Some of the data, however, are erroneous.
Browse files- config.yaml +5 -3
- explore/explore_fetch_pie_annotate.py +14 -0
- explore/explore_list_index.py +13 -0
- idiomify/preprocess.py +31 -0
- main_upload_literal2idiomatic.py +2 -1
config.yaml
CHANGED
|
@@ -15,7 +15,9 @@ idioms:
|
|
| 15 |
ver: d-1-2
|
| 16 |
description: the set of idioms in the training set of literal2idiomatic_d-1-2.
|
| 17 |
literal2idiomatic:
|
| 18 |
-
ver: d-1-2
|
| 19 |
-
description:
|
| 20 |
train_ratio: 0.8
|
| 21 |
-
seed: 104
|
|
|
|
|
|
|
|
|
| 15 |
ver: d-1-2
|
| 16 |
description: the set of idioms in the training set of literal2idiomatic_d-1-2.
|
| 17 |
literal2idiomatic:
|
| 18 |
+
ver: d-1-3
|
| 19 |
+
description: The idioms are annotated with <idiom> & </idiom>.
|
| 20 |
train_ratio: 0.8
|
| 21 |
+
seed: 104
|
| 22 |
+
boi_token: <idiom>
|
| 23 |
+
eoi_token: </idiom>
|
explore/explore_fetch_pie_annotate.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from idiomify.fetchers import fetch_pie
# Import through the package, like the fetchers import above; a bare
# `from preprocess import ...` only resolves when idiomify/ itself is on sys.path.
from idiomify.preprocess import annotate


def main():
    """
    Exploration script: fetch the PIE data, annotate it with the
    <idiom> / </idiom> tokens, and print every annotated idiomatic sentence.
    """
    # presumably a pandas DataFrame (it is used via .pipe / .iterrows) -- confirm in idiomify.fetchers
    pie_df = fetch_pie()
    pie_df = pie_df.pipe(annotate, boi_token="<idiom>", eoi_token="</idiom>")
    for _, row in pie_df.iterrows():
        print(row['Idiomatic_Sent'])


if __name__ == '__main__':
    main()
|
explore/explore_list_index.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
def main():
    """
    Exploration script: given a list of B/I/O labels, find the index of the
    first "B" (beginning of the idiom) and the negative index of the last "I"
    (end of the idiom), then print both indices and the labels they point at.
    """
    # Every label needs its own comma: adjacent string literals ("I" "O")
    # are silently concatenated into a single "IO" element.
    labels = ["O", "O", "B", "O", "I", "I", "O", "I", "O", "O"]
    boi_idx = labels.index("B")
    # Position of the last "I", expressed as a negative index counted from the end.
    eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)
    print(boi_idx, eoi_idx)
    print(labels[boi_idx])
    print(labels[eoi_idx])


if __name__ == '__main__':
    main()
|
idiomify/preprocess.py
CHANGED
|
@@ -17,6 +17,36 @@ def cleanse(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 17 |
return df
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
| 21 |
"""
|
| 22 |
stratified-split the given df into two df's.
|
|
@@ -29,3 +59,4 @@ def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.Data
|
|
| 29 |
test_size=other_size, random_state=seed,
|
| 30 |
shuffle=True)
|
| 31 |
return ratio_df, other_df
|
|
|
|
|
|
| 17 |
return df
|
| 18 |
|
| 19 |
|
| 20 |
+
def annotate(df: pd.DataFrame, boi_token: str, eoi_token: str) -> pd.DataFrame:
    """
    Wrap the idiom span of every Idiomatic_Sent with boi_token / eoi_token.

    The span is located with the space-separated B/I/O tags in Idiomatic_Label:
    it starts at the first "B" and ends at the last "I" (or at the "B" token
    itself when there is no "I"). Rows without a "B" are left untouched.

    e.g. with boi_token="<idiom>" and eoi_token="</idiom>", a row like this:
    Idiomatic_Sent   he started keeping an eye on what he ate .
    Idiomatic_Label  O O B I I I O O O O
    gets its Idiomatic_Sent replaced with:
    he started <idiom> keeping an eye on </idiom> what he ate .

    :param df: frame with string columns Idiomatic_Sent and Idiomatic_Label.
    :param boi_token: token inserted before the first idiom token.
    :param eoi_token: token inserted after the last idiom token.
    :return: the same df object, with Idiomatic_Sent overwritten in place.
    """
    if df.empty:
        return df
    annotated = []
    for sent, label_str in zip(df["Idiomatic_Sent"], df["Idiomatic_Label"]):
        tokens = sent.split(" ")
        labels = label_str.split(" ")
        if "B" in labels:
            boi_idx = labels.index("B")
            if "I" in labels:
                # Negative index of the last "I", counted from the end of the labels.
                # NOTE(review): this lines up with `tokens` only if the sentence and
                # its labels have the same number of space-separated items -- confirm
                # for the rows reported as erroneous.
                eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)
                tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]}"
                tokens[eoi_idx] = f"{tokens[eoi_idx]} {eoi_token}"
            else:
                # Single-token idiom: open and close around the same token.
                tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]} {eoi_token}"
        annotated.append(" ".join(tokens))
    # Assign the whole column at once. Writing to the `row` yielded by
    # df.iterrows() may only modify a copy, and a positional list assignment
    # also works when the index holds duplicates.
    df["Idiomatic_Sent"] = annotated
    return df
|
| 48 |
+
|
| 49 |
+
|
| 50 |
def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
| 51 |
"""
|
| 52 |
stratified-split the given df into two df's.
|
|
|
|
| 59 |
test_size=other_size, random_state=seed,
|
| 60 |
shuffle=True)
|
| 61 |
return ratio_df, other_df
|
| 62 |
+
|
main_upload_literal2idiomatic.py
CHANGED
|
@@ -4,7 +4,7 @@ literal2idiomatic ver: d-1-2
|
|
| 4 |
import os
|
| 5 |
from idiomify.paths import ROOT_DIR
|
| 6 |
from idiomify.fetchers import fetch_pie, fetch_config
|
| 7 |
-
from idiomify.preprocess import upsample, cleanse, stratified_split
|
| 8 |
import wandb
|
| 9 |
|
| 10 |
|
|
@@ -15,6 +15,7 @@ def main():
|
|
| 15 |
config = fetch_config()['literal2idiomatic']
|
| 16 |
train_df, test_df = pie_df.pipe(cleanse)\
|
| 17 |
.pipe(upsample, seed=config['seed'])\
|
|
|
|
| 18 |
.pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
|
| 19 |
# why don't you just "select" the columns? yeah, stop using csv library. just select them.
|
| 20 |
train_df = train_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
|
|
|
|
| 4 |
import os
|
| 5 |
from idiomify.paths import ROOT_DIR
|
| 6 |
from idiomify.fetchers import fetch_pie, fetch_config
|
| 7 |
+
from idiomify.preprocess import upsample, cleanse, stratified_split, annotate
|
| 8 |
import wandb
|
| 9 |
|
| 10 |
|
|
|
|
| 15 |
config = fetch_config()['literal2idiomatic']
|
| 16 |
train_df, test_df = pie_df.pipe(cleanse)\
|
| 17 |
.pipe(upsample, seed=config['seed'])\
|
| 18 |
+
.pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
|
| 19 |
.pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
|
| 20 |
# why don't you just "select" the columns? yeah, stop using csv library. just select them.
|
| 21 |
train_df = train_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
|