Fix: Include custom model code for HF Spaces deployment

- Add model.py with DGAEncoderForSequenceClassification
- Add charset.py with encoding utilities
- Update app.py to import the local model instead of AutoModel
- This fixes the 'model type dga_encoder not recognized' error
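For context, the error comes from `AutoModel`'s architecture registry: `dga_encoder` is a custom `model_type`, so `transformers` cannot resolve it unless the checkpoint repo ships the defining code. A minimal before/after sketch, assuming the Hub repo does not bundle the custom code (the exact exception text varies by `transformers` version):

```python
from transformers import AutoModelForSequenceClassification

# Before this commit: fails because `dga_encoder` is not in the AutoModel
# registry and no remote code is available for it; transformers raises a
# ValueError along the lines of "model type `dga_encoder` ... not recognized".
# model = AutoModelForSequenceClassification.from_pretrained(
#     "ccss17/dga-transformer-encoder", trust_remote_code=True
# )

# After this commit: the Space bundles model.py locally, so the concrete class
# loads the checkpoint directly and no registry lookup is needed.
from model import DGAEncoderForSequenceClassification

model = DGAEncoderForSequenceClassification.from_pretrained(
    "ccss17/dga-transformer-encoder"
)
```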
README.md (CHANGED)

```diff
@@ -46,12 +46,12 @@ Domain Generation Algorithms (DGAs) are used by malware to generate pseudo-rando
 
 ## Technical Details
 
-- **Architecture**: Custom Transformer Encoder (4 layers, 256 dim,
+- **Architecture**: Custom Transformer Encoder (4 layers, 256 dim, 8 heads)
 - **Parameters**: 3.2M
 - **Training Data**: ExtraHop DGA dataset (500K samples)
 - **Framework**: PyTorch + HuggingFace Transformers
+- **Model Files**: This Space includes the custom model code (`model.py`, `charset.py`) to enable loading the custom architecture
 
 ---
 
 **Built with ❤️ using PyTorch, HuggingFace, and Gradio**
-# Force rebuild
```
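As a sanity check on the README's 3.2M figure, here is a back-of-envelope parameter count, assuming the hyperparameters defined in `model.py` below (`vocab_size=40`, `max_len=64`, `d_model=256`, `num_layers=4`, `ffn_mult=4`); this is a sketch, not an authoritative audit:

```python
# Rough parameter count for the 4-layer Transformer encoder configured below.
d, layers, vocab, max_len = 256, 4, 40, 64

attn = 3 * d * d + 3 * d + d * d + d           # QKV in-projection + output projection
ffn = (d * 4 * d + 4 * d) + (4 * d * d + d)    # two feed-forward linears (ffn_mult=4)
norms = 2 * 2 * d                              # two LayerNorms (weight + bias each)
per_layer = attn + ffn + norms                 # 789,760

embeddings = vocab * d + max_len * d           # token + learned positional embeddings
head = 2 * d + (d * 2 + 2)                     # final LayerNorm + 2-way classifier

total = embeddings + layers * per_layer + head
print(f"{total:,}")                            # 3,186,690, i.e. ~3.2M as stated
```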
app.py (CHANGED)

```diff
@@ -6,37 +6,16 @@ interface for classifying domains as legitimate or DGA-generated.
 
 import torch
 import gradio as gr
-from transformers import AutoModelForSequenceClassification
 
-#
-
-
-PAD = 0
-
-
-def encode_domain(domain: str, max_len: int = 64):
-    """Encode domain string to token IDs.
-
-    Args:
-        domain: Domain name to encode
-        max_len: Maximum length (padding/truncation)
-
-    Returns:
-        List of token IDs
-    """
-    ids = [CHAR_TO_IDX.get(c, PAD) for c in domain.lower()]
-    ids = ids[:max_len]
-    ids = ids + [PAD] * (max_len - len(ids))
-    return ids
+# Import custom model and encoding
+from model import DGAEncoderForSequenceClassification
+from charset import encode_domain
 
 
 # Load model from HuggingFace Hub
-MODEL_NAME = "ccss17/dga-transformer-encoder"
+MODEL_NAME = "ccss17/dga-transformer-encoder"
 print(f"Loading model from {MODEL_NAME}...")
-model = AutoModelForSequenceClassification.from_pretrained(
-    MODEL_NAME,
-    trust_remote_code=True,  # Required for custom model architecture
-)
+model = DGAEncoderForSequenceClassification.from_pretrained(MODEL_NAME)
 model.eval()
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -59,9 +38,7 @@ def predict_domain(domain: str):
     domain = domain.strip().lower()
 
     # Encode domain to token IDs
-    input_ids = torch.tensor(
-        [encode_domain(domain, max_len=64)], device=device
-    )
+    input_ids = torch.tensor([encode_domain(domain, max_len=64)], device=device)
 
     # Get model prediction
     with torch.no_grad():
```
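The second hunk ends just as inference begins. A hypothetical sketch of how the body under `with torch.no_grad():` plausibly continues; the label names and index order (0 = legitimate, 1 = DGA) are assumptions, not shown in the diff:

```python
import torch

def classify(model, input_ids: torch.Tensor) -> dict:
    """Map model logits to class probabilities for a Gradio label output."""
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits    # shape (1, 2)
    probs = torch.softmax(logits, dim=-1).squeeze(0)  # normalize over the two classes
    # Assumed index order: 0 = legitimate, 1 = DGA.
    return {"legitimate": float(probs[0]), "DGA": float(probs[1])}
```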
charset.py (ADDED)

```python
import string

# Character-level tokenizer for registrable domain labels (no TLD).
# Allowed set covers LDH + underscore/specials rarely seen; unknown chars map to PAD.

CHARS = list(string.ascii_lowercase + string.digits + "-_")

SPECIAL_TOKENS = ("<pad>", "<cls>")
PAD, CLS = range(len(SPECIAL_TOKENS))
SPECIAL_OFFSET = len(SPECIAL_TOKENS)

stoi = {c: i + SPECIAL_OFFSET for i, c in enumerate(CHARS)}  # reserve space for PAD/CLS
itos = {i: c for c, i in stoi.items()}
VOCAB_SIZE = SPECIAL_OFFSET + len(CHARS)  # include special tokens
ALLOWED_CHARS = set(stoi)


def normalize_domain(d: str) -> str:
    """Normalize a raw domain label into charset-safe characters.

    Example: normalize_domain("Example_Domain!") -> "example_domain"
    Concept: basic text preprocessing step before tokenization.
    """
    d = (d or "").strip().lower()
    return "".join(ch for ch in d if ch in ALLOWED_CHARS)


def encode_domain(d: str, max_len: int = 64):
    """Convert a domain label into fixed-length token ids.

    Example: encode_domain("abc", max_len=5) -> [1, 2, 3, 4, 0]
    Example (truncate): encode_domain("abcdef", max_len=4) -> [1, 2, 3, 4]
    Concept: character tokenization with padding (sequence modelling input).

    Note: the leading CLS token (id=1) is reserved as a sequence summary token,
    mirroring the common Transformer practice of reading the first position to
    represent the entire input for classification.
    """
    d = normalize_domain(d)
    ids = [CLS] + [stoi.get(ch, PAD) for ch in d][: max_len - 1]
    if len(ids) < max_len:
        ids += [PAD] * (max_len - len(ids))
    return ids
```
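A quick round-trip check of the tokenizer; every value follows directly from the definitions above (PAD=0, CLS=1, then 'a' maps to 2 and onward):

```python
from charset import VOCAB_SIZE, encode_domain, normalize_domain

assert VOCAB_SIZE == 40                                 # 2 specials + 26 letters + 10 digits + "-" + "_"
assert normalize_domain("Google.COM!") == "googlecom"   # '.' and '!' are outside the charset
print(encode_domain("abc", max_len=6))                  # [1, 2, 3, 4, 0, 0]: CLS, a, b, c, PAD, PAD
```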
model.py (ADDED)

```python
"""DGA Detection Model using Transformer Encoder.

This model treats domain names as sequences of characters and uses a Transformer
encoder to learn patterns that distinguish DGA (algorithmically generated) domains
from legitimate ones.
"""

from __future__ import annotations

from typing import Optional

import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import SequenceClassifierOutput

from charset import PAD, VOCAB_SIZE

NUM_CLASSES = 2


class DGAEncoder(nn.Module):
    """Transformer encoder for DGA (Domain Generation Algorithm) detection."""

    def __init__(
        self,
        *,
        vocab_size: int,
        max_len: int = 64,
        d_model: int = 256,
        nhead: int = 8,
        num_layers: int = 4,
        dropout: float = 0.1,
        ffn_mult: int = 4,
    ) -> None:
        super().__init__()

        self.tok = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
        self.pos = nn.Embedding(max_len, d_model)

        self.register_buffer(
            "position_ids",
            torch.arange(max_len).unsqueeze(0),
            persistent=False,
        )

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=ffn_mult * d_model,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
        )

        self.enc = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        self.norm = nn.LayerNorm(d_model)
        self.clf = nn.Linear(d_model, NUM_CLASSES)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass through the encoder."""
        b, L = x.shape
        pos = self.position_ids[:, :L].expand(b, L)
        h = self.tok(x) + self.pos(pos)
        h = self.enc(h)
        cls = self.norm(h[:, 0])
        return self.clf(cls)


class DGAEncoderConfig(PretrainedConfig):
    """Configuration for DGAEncoder compatible with HuggingFace Transformers."""

    model_type = "dga_encoder"

    def __init__(
        self,
        vocab_size: int = VOCAB_SIZE,
        max_len: int = 64,
        d_model: int = 256,
        nhead: int = 8,
        num_layers: int = 4,
        dropout: float = 0.1,
        ffn_mult: int = 4,
        num_labels: int = 2,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers
        self.dropout = dropout
        self.ffn_mult = ffn_mult
        self.num_labels = num_labels


class DGAEncoderForSequenceClassification(PreTrainedModel):
    """HuggingFace-compatible wrapper around DGAEncoder."""

    config_class = DGAEncoderConfig

    def __init__(self, config: DGAEncoderConfig):
        super().__init__(config)
        self.config = config

        self.encoder = DGAEncoder(
            vocab_size=config.vocab_size,
            max_len=config.max_len,
            d_model=config.d_model,
            nhead=config.nhead,
            num_layers=config.num_layers,
            dropout=config.dropout,
            ffn_mult=config.ffn_mult,
        )

        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ):
        """Forward pass compatible with HF Trainer."""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        logits = self.encoder(input_ids)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.num_labels), labels.view(-1)
            )

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )
```
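To confirm the bundled code is self-consistent, a minimal local smoke test (a sketch: it builds the model from the default config with random weights rather than pulling the trained checkpoint from the Hub):

```python
import torch

from charset import encode_domain
from model import DGAEncoderConfig, DGAEncoderForSequenceClassification

model = DGAEncoderForSequenceClassification(DGAEncoderConfig())
model.eval()

# Encode one suspicious-looking label and run a forward pass.
input_ids = torch.tensor([encode_domain("xjw3kq9vz2", max_len=64)])
with torch.no_grad():
    out = model(input_ids=input_ids)
print(out.logits.shape)  # torch.Size([1, 2]): one pair of class logits per domain
```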