Update README.md
Browse files
README.md
CHANGED
|
@@ -26,39 +26,28 @@ The model was trained on a multilingual dataset of cybersecurity and non-cyberse
|
|
| 26 |
|
| 27 |
```python
|
| 28 |
from sentence_transformers import SentenceTransformer
|
| 29 |
-
from sklearn.model_selection import train_test_split
|
| 30 |
-
from sklearn.preprocessing import LabelEncoder
|
| 31 |
-
import pandas as pd
|
| 32 |
-
import joblib
|
| 33 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 34 |
|
| 35 |
-
# Load
|
| 36 |
-
df = pd.read_csv("your_dataset.csv") # Requires 'clean_text' and 'label' columns
|
| 37 |
-
|
| 38 |
-
# Load the sentence transformer
|
| 39 |
embedder = SentenceTransformer("intfloat/multilingual-e5-large")
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
df["label"],
|
| 45 |
-
test_size=0.05,
|
| 46 |
-
stratify=df["label"],
|
| 47 |
-
random_state=42
|
| 48 |
-
)
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
| 54 |
|
| 55 |
-
# Generate
|
| 56 |
-
|
| 57 |
-
X_test_emb = embedder.encode(X_test.tolist(), convert_to_numpy=True, show_progress_bar=True)
|
| 58 |
|
| 59 |
-
#
|
| 60 |
-
|
| 61 |
-
model = joblib.load(model_path)
|
| 62 |
|
| 63 |
-
#
|
| 64 |
-
|
|
|
|
|
|
| 26 |
|
| 27 |
```python
|
| 28 |
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
from huggingface_hub import hf_hub_download
|
| 30 |
+
import joblib
|
| 31 |
|
| 32 |
+
# 1. Load the embedding model
|
|
|
|
|
|
|
|
|
|
| 33 |
embedder = SentenceTransformer("intfloat/multilingual-e5-large")
|
| 34 |
|
| 35 |
+
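# 2. Load the pretrained MLP classifier from Hugging Face Hub (joblib.load unpickles the file, which can execute arbitrary code; only load it from a repository you trust)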
|
| 36 |
+
model_path = hf_hub_download(repo_id="selfconstruct3d/cybersec_classifier", filename="cybersec_classifier.pkl")
|
| 37 |
+
model = joblib.load(model_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
# 3. Example input texts (can be in English or German)
|
| 40 |
+
texts = [
|
| 41 |
+
"A new ransomware attack has affected critical infrastructure in Germany.",
|
| 42 |
+
"The local sports club hosted its annual summer festival this weekend."
|
| 43 |
+
]
|
| 44 |
|
| 45 |
+
# 4. Generate embeddings
|
| 46 |
+
embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
|
|
|
|
| 47 |
|
| 48 |
+
# 5. Predict cybersecurity relevance
|
| 49 |
+
predictions = model.predict(embeddings)
|
|
|
|
| 50 |
|
| 51 |
+
# 6. Output results
|
| 52 |
+
for text, label in zip(texts, predictions):
|
| 53 |
+
print(f"Text: {text}\nPrediction: {label}\n")
|