#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
import joblib
# In[2]:
# The eight per-day flow CSV exports that make up the training corpus.
csv_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
]
# In[3]:
# Load straight into the list so that no extra module-level names keep the
# per-day frames alive after they have been concatenated.
data_list = [pd.read_csv(csv_path) for csv_path in csv_files]
# In[4]:
# Columns used as model inputs. The inference script must produce exactly
# these columns, in this order.
selected_features = [
    # Flow-based features
    'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    # Timing features
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Total', 'Bwd IAT Total',
    # Packet characteristics
    'Fwd Packet Length Max', 'Fwd Packet Length Min',
    'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
    # TCP flags
    'SYN Flag Count', 'FIN Flag Count', 'RST Flag Count',
    'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
    # Additional features
    'Total Fwd Packets', 'Total Backward Packets',
    'Fwd Header Length', 'Bwd Header Length',
    'Active Mean', 'Active Std', 'Idle Mean',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
]
# In[5]:
# Report the size of every per-day frame.
for i, day_frame in enumerate(data_list, start=1):
    rows, cols = day_frame.shape
    print(f'Data{i} -> {rows} rows, {cols} columns')
# In[6]:
data = pd.concat(data_list)
rows, cols = data.shape
# In[7]:
print('\nCombined dataset dimensions:')
print(f'Number of rows: {rows}')
print(f'Number of columns: {cols}')
print(f'Total cells: {rows * cols}')
# In[8]:
# Release the per-day frames. The previous `for d in data_list: del d` only
# unbound the loop variable and freed nothing; dropping the list (and the
# last loop variable that still points into it) removes the real references.
del data_list, day_frame
# In[9]:
# Strip surrounding whitespace from the column names
data.rename(columns=lambda name: name.strip(), inplace=True)
# In[10]:
# Check and remove duplicates
initial_rows = len(data)
data.drop_duplicates(inplace=True)
print(f'\nRemoved {initial_rows - len(data)} duplicates')
# In[11]:
# Check missing values
missing_val = data.isna().sum()
print('\nColumns with missing values:')
print(missing_val.loc[missing_val > 0])
# In[12]:
# Count infinite values per numeric column
numeric_cols = data.select_dtypes(include=np.number).columns
inf_count = np.isinf(data[numeric_cols]).sum()
print('\nColumns with infinite values:')
print(inf_count[inf_count > 0])
# In[13]:
# Replace infinite values with NaN so they are imputed together with the
# genuinely missing values below
print(f'\nInitial missing values: {data.isna().sum().sum()}')
data.replace([np.inf, -np.inf], np.nan, inplace=True)
print(f'Missing values after processing infinite values: {data.isna().sum().sum()}')
# In[14]:
# Fill missing values with the column median.
# Assign the result back instead of calling fillna(inplace=True) on
# `data[col]`: that is chained assignment, which emits a FutureWarning on
# pandas 2.x and does not modify `data` at all under Copy-on-Write.
for col in numeric_cols:
    if data[col].isna().any():
        data[col] = data[col].fillna(data[col].median())
# In[15]:
# Map the raw dataset labels onto coarser attack families
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'PortScan': 'Port Scan',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Bot',
    # These keys contain U+FFFD (replacement character) exactly as the labels
    # appear after loading — presumably the dash in the CSV is not valid
    # UTF-8; verify against the loaded 'Label' values.
    'Web Attack � Brute Force': 'Web Attack',
    'Web Attack � XSS': 'Web Attack',
    'Web Attack � Sql Injection': 'Web Attack',
    # Same labels with an en dash, in case the files are read with an
    # encoding that decodes the dash properly (assumed cp1252 — TODO confirm).
    'Web Attack \u2013 Brute Force': 'Web Attack',
    'Web Attack \u2013 XSS': 'Web Attack',
    'Web Attack \u2013 Sql Injection': 'Web Attack',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed'
}
# In[16]:
data['Attack Type'] = data['Label'].map(attack_map)
# Labels missing from attack_map become NaN and then drop out of the balanced
# set below without any error, so report them explicitly.
unmapped_labels = data.loc[data['Attack Type'].isna(), 'Label'].value_counts()
if not unmapped_labels.empty:
    print('\nWARNING: labels not present in attack_map (these rows will be excluded):')
    print(unmapped_labels)
data.drop('Label', axis=1, inplace=True)
# In[17]:
# Display attack distribution
print('\nAttack type distribution:')
print(data['Attack Type'].value_counts())
# In[18]:
# Keep only classes with more than `min_class_size` rows and cap every kept
# class at `max_class_size` rows.
min_class_size = 1950
max_class_size = 5000
class_counts = data['Attack Type'].value_counts()
selected_classes = class_counts[class_counts > min_class_size]
class_names = selected_classes.index
selected = data[data['Attack Type'].isin(class_names)]
dfs = []
for name in class_names:
    df = selected[selected['Attack Type'] == name]
    # Down-sample only when the class really exceeds the cap. The previous
    # test (`len(df) > 2500` followed by `sample(n=5000)`) raised ValueError
    # for any class holding between 2501 and 4999 rows.
    if len(df) > max_class_size:
        df = df.sample(n=max_class_size, random_state=0)
    dfs.append(df)
data = pd.concat(dfs, ignore_index=True)
print('\nBalanced attack type distribution:')
print(data['Attack Type'].value_counts())
# In[19]:
# Plot attack distribution
plt.figure(figsize=(12, 6))
sns.countplot(data=data, x='Attack Type')
plt.xticks(rotation=45)
plt.title('Distribution of Attack Types')
plt.tight_layout()
plt.show()
# In[20]:
# Prepare features
X = data[selected_features]
y = data['Attack Type']
# In[21]:
# Split BEFORE scaling. Fitting the scaler on the full dataset (as was done
# previously) lets statistics of the test rows leak into the training
# features. The split itself is unchanged: same seed, size and stratification.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# In[22]:
# Fit the scaler on the training rows only, then apply it to both partitions.
# The frames keep their original index and column names.
scaler = RobustScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=selected_features,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=selected_features,
    index=X_test_raw.index,
)
# Scaled copy of the whole feature table, used only for the plot below.
X_scaled = pd.DataFrame(scaler.transform(X), columns=selected_features)
# In[23]:
# Plot correlation matrix for scaled features
plt.figure(figsize=(20, 16))
sns.heatmap(X_scaled.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Selected Features')
plt.tight_layout()
plt.show()
# In[24]:
print('\nTraining set shape:', X_train.shape)
print('Testing set shape:', X_test.shape)
# In[25]:
# Requested number of training rows per class after resampling: 30000 for
# BENIGN and 40000 for every attack family. The dict is passed to SMOTE as
# its sampling_strategy.
target_samples = {'BENIGN': 30000}
target_samples.update(dict.fromkeys(
    ['DoS', 'DDoS', 'Port Scan', 'Brute Force', 'Web Attack', 'Bot'],
    40000,
))
sampler = SMOTE(sampling_strategy=target_samples, random_state=42)
resampled = sampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled
print('\nKształt zbioru treningowego po resamplingu:', X_train_resampled.shape)
print('\nRozkład klas po resamplingu:')
print(pd.Series(y_train_resampled).value_counts())
# In[27]:
# Encode the string class names as integers. The encoder is fitted on the
# resampled training labels and then reused for the test labels.
le = LabelEncoder()
le.fit(y_train_resampled)
y_train_encoded = le.transform(y_train_resampled)
y_test_encoded = le.transform(y_test)
# In[28]:
import xgboost as xgb
# In[30]:
xgb_model = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=5,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    num_class=len(np.unique(y_train_encoded)),
    # GPU-accelerated training.
    # NOTE(review): 'gpu_hist' is deprecated in XGBoost 2.x in favour of
    # tree_method='hist' together with device='cuda' — confirm against the
    # installed version.
    tree_method='gpu_hist',
    # With several metrics, the last one ('merror') drives early stopping.
    eval_metric=['mlogloss', 'merror'],
    random_state=42,
    verbosity=1,
    # The parameter is `early_stopping_rounds`; the previous `early_stopping`
    # is not an XGBClassifier parameter, so early stopping was never enabled.
    early_stopping_rounds=20
)
# In[31]:
# NOTE(review): the evaluation set is the test split, so the stopping
# iteration is chosen on the same rows the model is scored on below and the
# reported metrics are optimistic. Prefer a separate validation split.
xgb_model.fit(
    X_train_resampled,
    y_train_encoded,
    eval_set=[(X_test, y_test_encoded)],
    verbose=True
)
# In[32]:
y_pred = xgb_model.predict(X_test)
y_pred_decoded = le.inverse_transform(y_pred)
# Results
print('\nXGBoost Results:')
print(classification_report(y_test, y_pred_decoded))
# In[33]:
# Save the model to a file
xgb_model.save_model('xgboost_model3.json')
# In[34]:
plt.figure(figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, y_pred_decoded),
            annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.title('XGBoost Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()
# In[35]:
from catboost import CatBoostClassifier
# In[37]:
cat_model = CatBoostClassifier(
    iterations=1500,                # upper bound on boosting iterations
    learning_rate=0.025,
    depth=14,                       # tree depth
    l2_leaf_reg=4,                  # L2 regularisation coefficient
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=30,
    auto_class_weights='Balanced',
    task_type='GPU',
    leaf_estimation_iterations=10,
)
# In[ ]:
# NOTE(review): the evaluation set is the test split, so both early stopping
# and use_best_model select the iteration on the rows scored below.
cat_model.fit(
    X_train_resampled,
    y_train_encoded,
    eval_set=[(X_test, y_test_encoded)],
    use_best_model=True,
    plot=True
)
# In[99]:
# Flatten the predictions to a 1-D integer vector before decoding.
# NOTE(review): CatBoost is assumed to return an (n, 1) array for multiclass
# models — confirm against the installed version.
y_pred = np.asarray(cat_model.predict(X_test)).ravel().astype(int)
# In[100]:
print('\nCatBoost Results:')
y_pred_decoded = le.inverse_transform(y_pred)  # decode class indices to names
print(classification_report(y_test, y_pred_decoded))
# In[101]:
# Take the axis labels from the fitted encoder (as the XGBoost and Random
# Forest plots do) instead of a hand-maintained list that can drift from the
# classes actually present.
class_names = list(le.classes_)
plt.figure(figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, y_pred_decoded),
            annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('CatBoost Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=45, ha='right')  # rotate the labels for readability
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
# In[102]:
cat_model.save_model('catboost_model3.cbm', format='cbm', export_parameters=None, pool=None)
# In[103]:
def plot_feature_importance(model, feature_names, top_n=20):
    """Plot the largest feature importances of a fitted model as horizontal bars.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Sequence of feature names aligned with
            ``model.feature_importances_``. Any sequence is accepted (list,
            pandas Index, numpy array).
        top_n: Number of most important features to show. ``None`` shows all
            features. Defaults to 20, matching the figure title.
    """
    importance = np.asarray(model.feature_importances_)
    # Convert to an array so that fancy indexing also works for plain lists.
    names = np.asarray(feature_names)
    # argsort is ascending, so the most important features sit at the end and
    # end up at the top of the horizontal bar chart.
    sorted_idx = np.argsort(importance)
    if top_n is not None:
        sorted_idx = sorted_idx[max(len(sorted_idx) - top_n, 0):]
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(12, 6))
    plt.barh(pos, importance[sorted_idx])
    plt.yticks(pos, names[sorted_idx])
    plt.xlabel('Feature Importance')
    plt.title(f'Feature Importance (Top {len(sorted_idx)})')
    plt.show()
# In[114]:
# Random Forest hyper-parameters, collected in one place.
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    max_features=20,
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)
# In[115]:
rf.fit(X_train_resampled, y_train_encoded)
# In[116]:
# Predict on the held-out split and decode the class indices back to names.
y_pred_rf = rf.predict(X_test)
y_pred_rf_decoded = le.inverse_transform(y_pred_rf)
acc_rf = accuracy_score(y_test, y_pred_rf_decoded)
print(f"\nDokładność Random Forest: {acc_rf:.4f}")
print("\nRaport klasyfikacji Random Forest:")
print(classification_report(y_test, y_pred_rf_decoded))
# In[117]:
rf_confusion = confusion_matrix(y_test, y_pred_rf_decoded)
plt.figure(figsize=(12, 10))
sns.heatmap(
    rf_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()
# In[ ]:
# Persist the forest together with the scaler and label encoder that the
# inference script loads.
for artefact, target_file in (
    (rf, 'random_forest_model3.pkl'),
    (scaler, 'scaler_3.joblib'),
    (le, 'label_encoder_3.joblib'),
):
    joblib.dump(artefact, target_file)
I trained my models this way. They give good results on the training data, but I have problems when testing them on the data they are actually meant to work with: Suricata logs. All attacks are predicted as normal BENIGN traffic.
import numpy as np
import pandas as pd
import json
import joblib
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import RobustScaler
from datetime import datetime
import dateutil.parser
from collections import defaultdict
import warnings
import traceback
import json
from pathlib import Path
warnings.filterwarnings('ignore')
def load_and_normalize_events(file_path):
    """Read a Suricata eve.json file and group its events by flow id.

    Every line is parsed as one JSON document. Events that carry a
    ``flow_id`` are appended, in file order, to the list stored under
    ``str(flow_id)``. Lines that cannot be parsed or processed are counted,
    reported on stdout and skipped.

    Args:
        file_path: Path of the eve.json file.

    Returns:
        A ``defaultdict(list)`` mapping flow id strings to lists of event
        dicts. If the file cannot be read, whatever was collected up to that
        point is returned (possibly nothing) and no statistics are printed.
    """
    print("Loading events...")
    flows = defaultdict(list)
    type_counts = {}
    flow_event_count = 0
    bad_lines = 0
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            line_no = 0
            for raw in handle:
                line_no += 1
                try:
                    record = json.loads(raw)
                    # Tally every event type, not only the flow-bound ones
                    kind = record.get('event_type', 'unknown')
                    type_counts[kind] = type_counts.get(kind, 0) + 1
                    if 'flow_id' not in record:
                        continue
                    flows[str(record['flow_id'])].append(record)
                    flow_event_count += 1
                    if flow_event_count % 100000 == 0:
                        print(f"Loaded {flow_event_count} flow events...")
                except Exception as e:
                    bad_lines += 1
                    if isinstance(e, json.JSONDecodeError):
                        print(f"Skipped invalid JSON at line {line_no}: {str(e)}")
                    else:
                        print(f"Error processing line {line_no}: {str(e)}")
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return flows
    print("\nEvent Type Statistics:")
    for kind, count in type_counts.items():
        print(f"{kind}: {count}")
    print(f"\nTotal events processed: {sum(type_counts.values())}")
    print(f"Total flow events: {flow_event_count}")
    print(f"Unique flows: {len(flows)}")
    print(f"Skipped lines: {bad_lines}")
    return flows
# Output columns, in the order of the training script's `selected_features`.
_FEATURE_NAMES = (
    'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Total', 'Bwd IAT Total',
    'Fwd Packet Length Max', 'Fwd Packet Length Min',
    'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
    'SYN Flag Count', 'FIN Flag Count', 'RST Flag Count',
    'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
    'Total Fwd Packets', 'Total Backward Packets',
    'Fwd Header Length', 'Bwd Header Length',
    'Active Mean', 'Active Std', 'Idle Mean',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
)

# (feature name, bit in the TCP flags byte, boolean key inside the EVE `tcp` object)
_TCP_FLAGS = (
    ('FIN Flag Count', 0x01, 'fin'),
    ('SYN Flag Count', 0x02, 'syn'),
    ('RST Flag Count', 0x04, 'rst'),
    ('PSH Flag Count', 0x08, 'psh'),
    ('ACK Flag Count', 0x10, 'ack'),
    ('URG Flag Count', 0x20, 'urg'),
)

# Timestamp layout tried first, e.g. "2017-07-07T12:00:00.123456+0200".
_EVE_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%f%z"

_MICROSECONDS_PER_SECOND = 1_000_000.0


def _parse_eve_timestamp(value):
    """Parse a timestamp string; fall back to dateutil for other layouts."""
    try:
        return datetime.strptime(value, _EVE_TIMESTAMP_FORMAT)
    except (TypeError, ValueError):
        return dateutil.parser.parse(value)


def extract_flow_features(events):
    """Build one model input row from the events of a single flow.

    The row uses the same keys as the training feature list; every value
    starts at 0.0 and stays there when the events carry no matching data.

    Changes compared with the previous implementation:

    * Both directions are read from every ``flow`` object (``*_toserver`` is
      forward, ``*_toclient`` is backward). Previously the direction was
      derived from the event's ``src_ip``, so the backward counters were
      almost never read.
    * The counters are treated as cumulative and the maximum over all events
      is kept instead of the sum, so a flow that appears in several events
      is not counted twice.
      NOTE(review): assumes ``flow`` counters are running totals — confirm.
    * TCP flags are decoded from the hexadecimal ``tcp_flags`` string and
      from the boolean ``syn``/``fin``/... keys. The old substring tests for
      upper-case letters could not match a lower-case hex string. Only
      presence is available, so each ``* Flag Count`` is 0.0 or 1.0.
    * ``Flow Duration`` and the ``Flow IAT *`` values are in microseconds.
      NOTE(review): assumes the training CSVs use microseconds for these
      columns, as CICFlowMeter is understood to do — confirm.
    * An empty event list returns the all-zero row directly.

    Never populated here, so always 0.0: ``Fwd IAT Total``, ``Bwd IAT
    Total``, ``Fwd Header Length``, ``Bwd Header Length``, ``Active Mean``,
    ``Active Std`` and ``Idle Mean``. The ``Flow IAT *`` values are computed
    between event timestamps, and the packet-length statistics from the
    length of ``payload_printable``; neither is a per-packet measurement.
    These columns therefore will not match the training data distribution.

    Args:
        events: List of event dicts that share one flow id.

    Returns:
        dict mapping each feature name to a float.
    """
    features = dict.fromkeys(_FEATURE_NAMES, 0.0)
    if not events:
        return features
    try:
        # The first event's source address is the reference for the
        # per-event (window / payload) direction decisions below.
        src_ip = events[0].get('src_ip')
        timestamps = []
        packet_lengths = []
        fwd_packet_lengths = []
        bwd_packet_lengths = []
        total_bytes_fwd = 0.0
        total_bytes_bwd = 0.0
        total_packets_fwd = 0.0
        total_packets_bwd = 0.0
        flow_start = None
        flow_end = None
        flags_seen = set()

        for event in events:
            from_originator = event.get('src_ip') == src_ip

            # Flow statistics: keep the largest value seen for each counter.
            flow = event.get('flow')
            if isinstance(flow, dict):
                total_bytes_fwd = max(total_bytes_fwd, float(flow.get('bytes_toserver', 0)))
                total_packets_fwd = max(total_packets_fwd, float(flow.get('pkts_toserver', 0)))
                total_bytes_bwd = max(total_bytes_bwd, float(flow.get('bytes_toclient', 0)))
                total_packets_bwd = max(total_packets_bwd, float(flow.get('pkts_toclient', 0)))
                if 'start' in flow:
                    started = _parse_eve_timestamp(flow['start'])
                    flow_start = started if flow_start is None else min(flow_start, started)
                if 'end' in flow:
                    ended = _parse_eve_timestamp(flow['end'])
                    flow_end = ended if flow_end is None else max(flow_end, ended)

            # TCP flags and window sizes
            tcp = event.get('tcp')
            if isinstance(tcp, dict):
                try:
                    flag_mask = int(str(tcp.get('tcp_flags', '0')), 16)
                except ValueError:
                    flag_mask = 0
                for feature_name, bit, bool_key in _TCP_FLAGS:
                    if flag_mask & bit or tcp.get(bool_key) is True:
                        flags_seen.add(feature_name)
                # NOTE(review): it is unclear whether the `tcp` object ever
                # carries a 'window' key; if not, both Init_Win features stay 0.
                if 'window' in tcp:
                    window_size = float(tcp['window'])
                    win_key = ('Init_Win_bytes_forward' if from_originator
                               else 'Init_Win_bytes_backward')
                    features[win_key] = max(features[win_key], window_size)

            # Event timestamp, used for the inter-arrival statistics and as
            # a fallback for the flow duration.
            if 'timestamp' in event:
                timestamps.append(_parse_eve_timestamp(event['timestamp']))

            # Payload length statistics (see the docstring caveat)
            if 'payload_printable' in event:
                length = len(event['payload_printable'])
                packet_lengths.append(length)
                if from_originator:
                    fwd_packet_lengths.append(length)
                else:
                    bwd_packet_lengths.append(length)

        for feature_name in flags_seen:
            features[feature_name] = 1.0

        # Duration: prefer the flow record's own start/end; otherwise use the
        # span between the first and last event timestamp.
        timestamps.sort()
        if flow_start is not None and flow_end is not None:
            duration = (flow_end - flow_start).total_seconds()
        elif timestamps:
            duration = (timestamps[-1] - timestamps[0]).total_seconds()
        else:
            duration = 0.0
        duration = max(duration, 0.0)
        features['Flow Duration'] = duration * _MICROSECONDS_PER_SECOND
        if duration > 0:
            # Rates stay per second; only durations are in microseconds.
            features['Flow Bytes/s'] = (total_bytes_fwd + total_bytes_bwd) / duration
            features['Flow Packets/s'] = (total_packets_fwd + total_packets_bwd) / duration

        # Inter-arrival times between consecutive events, in microseconds
        if len(timestamps) > 1:
            iats = [
                (later - earlier).total_seconds() * _MICROSECONDS_PER_SECOND
                for earlier, later in zip(timestamps, timestamps[1:])
            ]
            features['Flow IAT Mean'] = float(np.mean(iats))
            features['Flow IAT Std'] = float(np.std(iats))
            features['Flow IAT Max'] = float(np.max(iats))
            features['Flow IAT Min'] = float(np.min(iats))

        # Packet statistics
        features['Total Fwd Packets'] = total_packets_fwd
        features['Total Backward Packets'] = total_packets_bwd
        features['Total Length of Fwd Packets'] = total_bytes_fwd
        features['Total Length of Bwd Packets'] = total_bytes_bwd
        if packet_lengths:
            features['Packet Length Mean'] = float(np.mean(packet_lengths))
            features['Packet Length Std'] = float(np.std(packet_lengths))
            features['Packet Length Variance'] = float(np.var(packet_lengths))
        if fwd_packet_lengths:
            features['Fwd Packet Length Max'] = max(fwd_packet_lengths)
            features['Fwd Packet Length Min'] = min(fwd_packet_lengths)
        if bwd_packet_lengths:
            features['Bwd Packet Length Max'] = max(bwd_packet_lengths)
            features['Bwd Packet Length Min'] = min(bwd_packet_lengths)
    except Exception as e:
        # Best effort: report the problem and return what was filled in so
        # far, so that one malformed flow does not abort the whole run.
        print(f"Error extracting features: {str(e)}")
    return features
def process_flows(flows):
    """Turn grouped flow events into a feature table.

    Args:
        flows: Mapping of flow id to the list of events of that flow.

    Returns:
        pandas.DataFrame with one row per flow: the dict returned by
        ``extract_flow_features`` plus a ``flow_id`` column. The frame is
        empty (no columns) when ``flows`` is empty.
    """
    total_flows = len(flows)
    rows = []
    print("Processing flows...")
    for processed, (flow_id, events) in enumerate(flows.items()):
        # Progress report before every 10000th flow, starting with the first
        if processed % 10000 == 0:
            print(f"Processed {processed}/{total_flows} flows...")
        row = extract_flow_features(events)
        row['flow_id'] = flow_id
        rows.append(row)
    return pd.DataFrame(rows)
def main():
    """Classify the flows of ``eve.json`` with the three saved models.

    Loads the XGBoost, CatBoost and Random Forest models together with the
    scaler and label encoder saved by the training script, extracts one
    feature row per flow, scales the rows, predicts with every model and
    writes the per-flow results to ``predictions_detailed.csv``.

    Returns:
        ``(results, features_df)`` — the prediction table and the unscaled
        feature table — or ``None`` when the models cannot be loaded or the
        input contains no flows.
    """
    # Load models
    print("Loading models...")
    try:
        models = {
            'xgboost': xgb.Booster(model_file='xgboost_model3.json'),
            'catboost': CatBoostClassifier().load_model('catboost_model3.cbm'),
            # NOTE(security): joblib.load unpickles arbitrary objects; only
            # load artefacts that this project produced itself.
            'random_forest': joblib.load('random_forest_model3.pkl'),
            'scaler': joblib.load('scaler_3.joblib'),
            'label_encoder': joblib.load('label_encoder_3.joblib')
        }
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return

    # Load and process events
    flows = load_and_normalize_events('eve.json')

    # Extract features
    features_df = process_flows(flows)
    if features_df.empty:
        # Without this guard an empty or unreadable eve.json fails below with
        # KeyError: 'flow_id'.
        print("No flow events found in eve.json; nothing to classify.")
        return

    # Prepare features for prediction
    flow_ids = features_df['flow_id']
    features_df = features_df.drop('flow_id', axis=1)

    # Put the columns into the order the scaler was fitted with, when the
    # scaler recorded it. A missing column raises KeyError here instead of
    # being silently misaligned.
    scaler = models['scaler']
    fitted_columns = getattr(scaler, 'feature_names_in_', None)
    if fitted_columns is not None:
        features_df = features_df[list(fitted_columns)]

    # Scale features; keep the column names so that estimators fitted on
    # DataFrames receive named features again.
    print("\nScaling features...")
    scaled_features = pd.DataFrame(
        scaler.transform(features_df),
        columns=features_df.columns,
        index=features_df.index,
    )

    # Make predictions
    print("Making predictions...")
    label_encoder = models['label_encoder']
    predictions = {}

    # XGBoost: the raw Booster returns one probability column per class
    dmatrix = xgb.DMatrix(
        scaled_features.to_numpy(),
        feature_names=scaled_features.columns.tolist(),
    )
    xgb_pred_prob = models['xgboost'].predict(dmatrix)
    predictions['XGBoost'] = label_encoder.inverse_transform(
        np.argmax(xgb_pred_prob, axis=1)
    )
    predictions['XGBoost_confidence'] = np.max(xgb_pred_prob, axis=1)

    # CatBoost
    cat_pred_prob = models['catboost'].predict_proba(scaled_features)
    predictions['CatBoost'] = label_encoder.inverse_transform(
        np.argmax(cat_pred_prob, axis=1)
    )
    predictions['CatBoost_confidence'] = np.max(cat_pred_prob, axis=1)

    # Random Forest
    rf_pred_prob = models['random_forest'].predict_proba(scaled_features)
    predictions['RandomForest'] = label_encoder.inverse_transform(
        np.argmax(rf_pred_prob, axis=1)
    )
    predictions['RandomForest_confidence'] = np.max(rf_pred_prob, axis=1)

    # Combine results
    results = pd.DataFrame(predictions)
    results['flow_id'] = flow_ids
    # Majority label across the three models (first mode on ties)
    results['consensus'] = results[['XGBoost', 'CatBoost', 'RandomForest']].mode(axis=1)[0]
    results['avg_confidence'] = results[
        ['XGBoost_confidence', 'CatBoost_confidence', 'RandomForest_confidence']
    ].mean(axis=1)

    # Save results
    results.to_csv('predictions_detailed.csv', index=False)
    print("\nResults saved to predictions_detailed.csv")

    # Print summary
    print("\nPrediction Summary by Model:")
    for model in ['XGBoost', 'CatBoost', 'RandomForest', 'consensus']:
        print(f"\n{model} predictions:")
        counts = results[model].value_counts()
        percentages = results[model].value_counts(normalize=True) * 100
        for label, count in counts.items():
            print(f"{label}: {count} ({percentages[label]:.2f}%)")

    # Print high confidence predictions
    print("\nHigh Confidence Predictions (>0.9):")
    high_conf = results[results['avg_confidence'] > 0.9]
    print(high_conf['consensus'].value_counts())
    return results, features_df


if __name__ == "__main__":
    main()
I tried to test them this way. I simply copied the data from Suricata: it is the exact eve.json file that Suricata creates.