noequal committed on
Commit 3893344 · 1 Parent(s): fd136e2

Update app.py

Files changed (1)
  1. app.py +18 -15
app.py CHANGED
@@ -1,20 +1,21 @@
 import streamlit as st
-from transformers import AutoModelForCausalLM, AutoTokenizer
-# Load a pre-trained version of ClinicalGPT
-model = AutoModelForCausalLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
-# Tokenize your clinical text data using the AutoTokenizer class
-tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
-# Convert your tokenized data into PyTorch tensors and create a PyTorch Dataset object
 import torch
-from torch.utils.data import Dataset
-
+from torch.utils.data import Dataset, random_split
+from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
 
+# Prompt user to enter clinical text data and corresponding labels
 train_texts = st.text_input("Enter your clinical text data (separated by commas):")
 train_labels = st.text_input("Enter your corresponding labels (separated by commas):")
 
+# Convert comma-separated values into lists
 train_texts = train_texts.split(",")
 train_labels = train_labels.split(",")
 
+# Load pre-trained model and tokenizer
+model = AutoModelForCausalLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+
+# Create PyTorch Dataset object
 class ClinicalDataset(Dataset):
     def __init__(self, texts, labels, tokenizer):
         self.texts = texts
@@ -29,12 +30,15 @@ class ClinicalDataset(Dataset):
         label = self.labels[idx]
         encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
         return {"input_ids": encoding["input_ids"].squeeze(), "attention_mask": encoding["attention_mask"].squeeze(), "labels": torch.tensor(label)}
+
+dataset = ClinicalDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
 
+# Split dataset into training and validation sets
+train_size = int(0.8 * len(dataset))
+val_size = len(dataset) - train_size
+train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
 
-dataset = ClinicalDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
-# Fine-tune the pre-trained model on your clinical dataset
-from transformers import Trainer, TrainingArguments
-
+# Fine-tune pre-trained model on clinical dataset
 training_args = TrainingArguments(
     output_dir='./results',          # output directory
     num_train_epochs=3,              # total number of training epochs
@@ -43,12 +47,11 @@ training_args = TrainingArguments(
     warmup_steps=500,                # number of warmup steps for learning rate scheduler
     weight_decay=0.01,               # strength of weight decay
     logging_dir='./logs',            # directory for storing logs
-    logging_steps=10, )
-
+    logging_steps=10,)
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=dataset,
+    train_dataset=train_dataset,
    eval_dataset=val_dataset,
     data_collator=lambda data: {'input_ids': torch.stack([f['input_ids'] for f in data]),
                                 'attention_mask': torch.stack([f['attention_mask'] for f in data]),
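
For context, the functional change in this commit is the 80/20 train/validation split added via torch.utils.data.random_split, which produces the train_dataset/val_dataset pair passed to Trainer. Below is a minimal, self-contained sketch of that split pattern; ToyDataset and the dataset size are illustrative stand-ins, not part of app.py.

import torch
from torch.utils.data import Dataset, random_split

class ToyDataset(Dataset):
    """Illustrative stand-in for ClinicalDataset, used only to show the split."""
    def __init__(self, n):
        self.items = list(range(n))

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return torch.tensor(self.items[idx])

dataset = ToyDataset(10)

# Same 80/20 split introduced in the updated app.py
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(len(train_dataset), len(val_dataset))  # prints: 8 2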