# modeling_phobert_attn.py
import torch
import torch.nn as nn
from transformers import AutoModel


class PhoBERT_Attention(nn.Module):
    """PhoBERT encoder with additive attention pooling over token hidden states."""

    def __init__(self, num_classes=2, dropout=0.3):
        super().__init__()
        # PhoBERT is built on the XLM-RoBERTa/RoBERTa architecture.
        self.xlm_roberta = AutoModel.from_pretrained("vinai/phobert-base")
        hidden = self.xlm_roberta.config.hidden_size
        self.attention = nn.Linear(hidden, 1)   # per-token attention score
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden, num_classes)

    def forward(self, input_ids, attention_mask):
        out = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)
        H = out.last_hidden_state                                 # [B, T, H]
        scores = self.attention(H)                                # [B, T, 1]
        # Mask padding positions so they receive zero attention weight.
        scores = scores.masked_fill(attention_mask.unsqueeze(-1) == 0, float("-inf"))
        attn = torch.softmax(scores, dim=1)                       # [B, T, 1]
        ctx = (attn * H).sum(dim=1)                               # [B, H]
        logits = self.fc(self.dropout(ctx))                       # [B, C]
        return logits
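

# A minimal usage sketch (not part of the model definition). It assumes the
# matching "vinai/phobert-base" tokenizer and a hypothetical input sentence;
# PhoBERT expects word-segmented Vietnamese text, so real inputs should be
# pre-segmented before tokenization.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    model = PhoBERT_Attention(num_classes=2)
    model.eval()

    # Hypothetical word-segmented example sentence.
    batch = tokenizer(
        ["Tôi rất thích bộ_phim này"],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(batch["input_ids"], batch["attention_mask"])
    print(logits.shape)  # expected: torch.Size([1, 2])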