ccss17 committed on
Commit 5d1d43b · 1 Parent(s): 3964bc6

Fix: Include custom model code for HF Spaces deployment


- Add model.py with DGAEncoderForSequenceClassification
- Add charset.py with encoding utilities
- Update app.py to import local model instead of AutoModel
- This fixes the 'model type dga_encoder not recognized' error
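
Note: the `model type dga_encoder not recognized` error occurs because `AutoModelForSequenceClassification` has no mapping from the custom `model_type = "dga_encoder"` to a model class unless the custom code ships on the Hub and is loaded with `trust_remote_code=True`. Importing `DGAEncoderForSequenceClassification` directly, as this commit does, sidesteps the Auto factory entirely. A minimal sketch of the alternative, keeping the Auto API by registering the custom classes locally (not what this commit does; it assumes `model.py` exports both the config and the model class):

```python
# Hypothetical alternative (not used in this commit): register the custom
# architecture with the Auto factories so AutoModelForSequenceClassification
# can resolve model_type "dga_encoder" without trust_remote_code.
from transformers import AutoConfig, AutoModelForSequenceClassification

from model import DGAEncoderConfig, DGAEncoderForSequenceClassification

AutoConfig.register("dga_encoder", DGAEncoderConfig)
AutoModelForSequenceClassification.register(
    DGAEncoderConfig, DGAEncoderForSequenceClassification
)

model = AutoModelForSequenceClassification.from_pretrained(
    "ccss17/dga-transformer-encoder"
)
```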

Files changed (4)
  1. README.md +2 -2
  2. app.py +6 -29
  3. charset.py +41 -0
  4. model.py +152 -0
README.md CHANGED
@@ -46,12 +46,12 @@ Domain Generation Algorithms (DGAs) are used by malware to generate pseudo-rando
 
 ## Technical Details
 
-- **Architecture**: Custom Transformer Encoder (4 layers, 256 dim, 4 heads)
+- **Architecture**: Custom Transformer Encoder (4 layers, 256 dim, 8 heads)
 - **Parameters**: 3.2M
 - **Training Data**: ExtraHop DGA dataset (500K samples)
 - **Framework**: PyTorch + HuggingFace Transformers
+- **Model Files**: This Space includes the custom model code (`model.py`, `charset.py`) to enable loading the custom architecture
 
 ---
 
 **Built with ❤️ using PyTorch, HuggingFace, and Gradio**
-# Force rebuild
app.py CHANGED
@@ -6,37 +6,16 @@ interface for classifying domains as legitimate or DGA-generated.
 
 import torch
 import gradio as gr
-from transformers import AutoModelForSequenceClassification
 
-# Character encoding (matches training charset)
-CHARSET = "abcdefghijklmnopqrstuvwxyz0123456789-."
-CHAR_TO_IDX = {c: i + 1 for i, c in enumerate(CHARSET)}
-PAD = 0
-
-
-def encode_domain(domain: str, max_len: int = 64):
-    """Encode domain string to token IDs.
-
-    Args:
-        domain: Domain name to encode
-        max_len: Maximum length (padding/truncation)
-
-    Returns:
-        List of token IDs
-    """
-    ids = [CHAR_TO_IDX.get(c, PAD) for c in domain.lower()]
-    ids = ids[:max_len]
-    ids = ids + [PAD] * (max_len - len(ids))
-    return ids
+# Import custom model and encoding
+from model import DGAEncoderForSequenceClassification
+from charset import encode_domain
 
 
 # Load model from HuggingFace Hub
-MODEL_NAME = "ccss17/dga-transformer-encoder"  # Replace with your model
+MODEL_NAME = "ccss17/dga-transformer-encoder"
 print(f"Loading model from {MODEL_NAME}...")
-model = AutoModelForSequenceClassification.from_pretrained(
-    MODEL_NAME,
-    trust_remote_code=True,  # Required for custom model architecture
-)
+model = DGAEncoderForSequenceClassification.from_pretrained(MODEL_NAME)
 model.eval()
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -59,9 +38,7 @@ def predict_domain(domain: str):
     domain = domain.strip().lower()
 
     # Encode domain to token IDs
-    input_ids = torch.tensor(
-        [encode_domain(domain, max_len=64)], device=device
-    )
+    input_ids = torch.tensor([encode_domain(domain, max_len=64)], device=device)
 
     # Get model prediction
    with torch.no_grad():
charset.py ADDED
@@ -0,0 +1,41 @@
+import string
+
+# Character-level tokenizer for registrable domain labels (no TLD).
+# Allowed set covers LDH + underscore/specials rarely seen; unknown chars map to PAD.
+
+CHARS = list(string.ascii_lowercase + string.digits + "-_")
+
+SPECIAL_TOKENS = ("<pad>", "<cls>")
+PAD, CLS = range(len(SPECIAL_TOKENS))
+SPECIAL_OFFSET = len(SPECIAL_TOKENS)
+
+stoi = {c: i + SPECIAL_OFFSET for i, c in enumerate(CHARS)}  # reserve space for PAD/CLS
+itos = {i: c for c, i in stoi.items()}
+VOCAB_SIZE = SPECIAL_OFFSET + len(CHARS)  # include special tokens
+ALLOWED_CHARS = set(stoi)
+
+def normalize_domain(d: str) -> str:
+    """Normalize a raw domain label into charset-safe characters.
+
+    Example: normalize_domain("Example_Domain!") -> "example_domain"
+    Concept: basic text preprocessing step before tokenization.
+    """
+    d = (d or "").strip().lower()
+    return "".join(ch for ch in d if ch in ALLOWED_CHARS)
+
+def encode_domain(d: str, max_len: int = 64):
+    """Convert a domain label into fixed-length token ids.
+
+    Example: encode_domain("abc", max_len=5) -> [1, 2, 3, 4, 0]
+    Example (truncate): encode_domain("abcdef", max_len=4) -> [1, 2, 3, 4]
+    Concept: character tokenization with padding (sequence modelling input).
+
+    Note: the leading CLS token (id=1) is reserved as a sequence summary token,
+    mirroring the common Transformer practice of reading the first position to
+    represent the entire input for classification.
+    """
+    d = normalize_domain(d)
+    ids = [CLS] + [stoi.get(ch, PAD) for ch in d][: max_len - 1]
+    if len(ids) < max_len:
+        ids += [PAD] * (max_len - len(ids))
+    return ids
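
A few quick checks of the encoding above; the expected values follow from the committed charset, where `<pad>` = 0, `<cls>` = 1, and lowercase letters start at id 2:

```python
# Sanity checks for charset.py; expected values assume PAD=0, CLS=1,
# and 'a' starting at id 2, as defined above.
from charset import VOCAB_SIZE, encode_domain, normalize_domain

assert normalize_domain("Example_Domain!") == "example_domain"
assert encode_domain("abc", max_len=5) == [1, 2, 3, 4, 0]   # <cls>, a, b, c, <pad>
assert encode_domain("abcdef", max_len=4) == [1, 2, 3, 4]   # truncated to max_len
assert VOCAB_SIZE == 40                                     # 2 specials + 26 + 10 + "-" + "_"
```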
model.py ADDED
@@ -0,0 +1,152 @@
+"""DGA Detection Model using Transformer Encoder.
+
+This model treats domain names as sequences of characters and uses a Transformer
+encoder to learn patterns that distinguish DGA (algorithmically generated) domains
+from legitimate ones.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+from charset import PAD, VOCAB_SIZE
+
+NUM_CLASSES = 2
+
+
+class DGAEncoder(nn.Module):
+    """Transformer encoder for DGA (Domain Generation Algorithm) detection."""
+
+    def __init__(
+        self,
+        *,
+        vocab_size: int,
+        max_len: int = 64,
+        d_model: int = 256,
+        nhead: int = 8,
+        num_layers: int = 4,
+        dropout: float = 0.1,
+        ffn_mult: int = 4,
+    ) -> None:
+        super().__init__()
+
+        self.tok = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
+        self.pos = nn.Embedding(max_len, d_model)
+
+        self.register_buffer(
+            "position_ids",
+            torch.arange(max_len).unsqueeze(0),
+            persistent=False,
+        )
+
+        enc_layer = nn.TransformerEncoderLayer(
+            d_model=d_model,
+            nhead=nhead,
+            dim_feedforward=ffn_mult * d_model,
+            dropout=dropout,
+            batch_first=True,
+            norm_first=True,
+        )
+
+        self.enc = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
+        self.norm = nn.LayerNorm(d_model)
+        self.clf = nn.Linear(d_model, NUM_CLASSES)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass through the encoder."""
+        b, L = x.shape
+        pos = self.position_ids[:, :L].expand(b, L)
+        h = self.tok(x) + self.pos(pos)
+        h = self.enc(h)
+        cls = self.norm(h[:, 0])
+        return self.clf(cls)
+
+
+class DGAEncoderConfig(PretrainedConfig):
+    """Configuration for DGAEncoder compatible with HuggingFace Transformers."""
+
+    model_type = "dga_encoder"
+
+    def __init__(
+        self,
+        vocab_size: int = VOCAB_SIZE,
+        max_len: int = 64,
+        d_model: int = 256,
+        nhead: int = 8,
+        num_layers: int = 4,
+        dropout: float = 0.1,
+        ffn_mult: int = 4,
+        num_labels: int = 2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.max_len = max_len
+        self.d_model = d_model
+        self.nhead = nhead
+        self.num_layers = num_layers
+        self.dropout = dropout
+        self.ffn_mult = ffn_mult
+        self.num_labels = num_labels
+
+
+class DGAEncoderForSequenceClassification(PreTrainedModel):
+    """HuggingFace-compatible wrapper around DGAEncoder."""
+
+    config_class = DGAEncoderConfig
+
+    def __init__(self, config: DGAEncoderConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.encoder = DGAEncoder(
+            vocab_size=config.vocab_size,
+            max_len=config.max_len,
+            d_model=config.d_model,
+            nhead=config.nhead,
+            num_layers=config.num_layers,
+            dropout=config.dropout,
+            ffn_mult=config.ffn_mult,
+        )
+
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ):
+        """Forward pass compatible with HF Trainer."""
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.config.use_return_dict
+        )
+
+        logits = self.encoder(input_ids)
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                logits.view(-1, self.config.num_labels), labels.view(-1)
+            )
+
+        if not return_dict:
+            output = (logits,)
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=None,
+            attentions=None,
+        )
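
A minimal end-to-end sketch tying the two new files together. The weights here are randomly initialized; `from_pretrained("ccss17/dga-transformer-encoder")` loads the trained checkpoint as `app.py` does, and the label order (0 = legit, 1 = DGA) is an assumption to check against the model card:

```python
# End-to-end sketch: encode a domain, run the classifier, inspect probabilities.
# Weights are random here; app.py loads the trained checkpoint from the Hub.
import torch

from charset import encode_domain
from model import DGAEncoderConfig, DGAEncoderForSequenceClassification

config = DGAEncoderConfig()  # defaults: 4 layers, d_model=256, 8 heads, max_len=64
model = DGAEncoderForSequenceClassification(config).eval()

input_ids = torch.tensor([encode_domain("examples-domain", max_len=64)])
with torch.no_grad():
    logits = model(input_ids=input_ids).logits          # shape (1, 2)
    probs = torch.softmax(logits, dim=-1).squeeze(0)

print({"legit": probs[0].item(), "dga": probs[1].item()})  # label order assumed
```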