How to test models trained on CICIDS2017?

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
import joblib



# In[2]:


data1 = pd.read_csv("Monday-WorkingHours.pcap_ISCX.csv")
data2 = pd.read_csv("Tuesday-WorkingHours.pcap_ISCX.csv")
data3 = pd.read_csv("Wednesday-workingHours.pcap_ISCX.csv")
data4 = pd.read_csv("Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")
data5 = pd.read_csv("Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv")
data6 = pd.read_csv("Friday-WorkingHours-Morning.pcap_ISCX.csv")
data7 = pd.read_csv("Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")
data8 = pd.read_csv("Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")


# In[3]:


data_list = [data1, data2, data3, data4, data5, data6, data7, data8]


# In[4]:


selected_features = [
    # Flow-based features
    'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    
    # Timing features
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Total', 'Bwd IAT Total',
    
    # Packet characteristics
    'Fwd Packet Length Max', 'Fwd Packet Length Min',
    'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
    
    # TCP flags
    'SYN Flag Count', 'FIN Flag Count', 'RST Flag Count',
    'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
    
    # Additional features
    'Total Fwd Packets', 'Total Backward Packets',
    'Fwd Header Length', 'Bwd Header Length',
    'Active Mean', 'Active Std', 'Idle Mean',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward'
]


# In[5]:


for i, data in enumerate(data_list, start=1):
    rows, cols = data.shape
    print(f'Data{i} -> {rows} rows, {cols} columns')


# In[6]:


data = pd.concat(data_list)
rows, cols = data.shape


# In[7]:


print('\nCombined dataset dimensions:')
print(f'Number of rows: {rows}')
print(f'Number of columns: {cols}')
print(f'Total cells: {rows * cols}')


# In[8]:


# Free the per-day frames once they have been concatenated
# (deleting the loop variable alone would not release them)
del data1, data2, data3, data4, data5, data6, data7, data8, data_list


# In[9]:


# Strip leading/trailing whitespace from column names
col_names = {col: col.strip() for col in data.columns}
data.rename(columns=col_names, inplace=True)


# In[10]:


# Check and remove duplicates
initial_rows = len(data)
data.drop_duplicates(inplace=True)
print(f'\nRemoved {initial_rows - len(data)} duplicates')


# In[11]:


# Check missing values
missing_val = data.isna().sum()
print('\nColumns with missing values:')
print(missing_val.loc[missing_val > 0])


# In[12]:


# Handle infinite values
numeric_cols = data.select_dtypes(include=np.number).columns
inf_count = np.isinf(data[numeric_cols]).sum()
print('\nColumns with infinite values:')
print(inf_count[inf_count > 0])


# In[13]:


# Replace infinite values with NaN
print(f'\nInitial missing values: {data.isna().sum().sum()}')
data.replace([np.inf, -np.inf], np.nan, inplace=True)
print(f'Missing values after processing infinite values: {data.isna().sum().sum()}')


# In[14]:


# Fill missing values with median
for col in numeric_cols:
    if data[col].isnull().any():
        median_val = data[col].median()
        data[col] = data[col].fillna(median_val)


# In[15]:


# Map attack types
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'PortScan': 'Port Scan',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Bot',
    # NB: the '�' below matches the mojibake present in the raw CSV labels (originally an en dash)
    'Web Attack � Brute Force': 'Web Attack',
    'Web Attack � XSS': 'Web Attack',
    'Web Attack � Sql Injection': 'Web Attack',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed'
}


# In[16]:


data['Attack Type'] = data['Label'].map(attack_map)
data.drop('Label', axis=1, inplace=True)


# In[17]:


# Display attack distribution
print('\nAttack type distribution:')
print(data['Attack Type'].value_counts())


# In[18]:


class_counts = data['Attack Type'].value_counts()
selected_classes = class_counts[class_counts > 1950]
class_names = selected_classes.index
selected = data[data['Attack Type'].isin(class_names)]

dfs = []
for name in class_names:
    df = selected[selected['Attack Type'] == name]
    if len(df) > 5000:
        df = df.sample(n=5000, random_state=0)
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
print('\nBalanced attack type distribution:')
print(data['Attack Type'].value_counts())


# In[19]:


# Plot attack distribution
plt.figure(figsize=(12, 6))
sns.countplot(data=data, x='Attack Type')
plt.xticks(rotation=45)
plt.title('Distribution of Attack Types')
plt.tight_layout()
plt.show()


# In[20]:


# Prepare features
X = data[selected_features]
y = data['Attack Type']


# In[21]:


# Scale features
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=selected_features)


# In[22]:


# Plot correlation matrix for scaled features
plt.figure(figsize=(20, 16))
sns.heatmap(X_scaled.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Selected Features')
plt.tight_layout()
plt.show()


# In[23]:


# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)


# In[24]:


print('\nTraining set shape:', X_train.shape)
print('Testing set shape:', X_test.shape)


# In[25]:


target_samples = {
    'BENIGN': 30000,      # boost representation of normal traffic
    'DoS': 40000,         # reduce dominance
    'DDoS': 40000,        # reduce dominance
    'Port Scan': 40000,   # balance against the other attacks
    'Brute Force': 40000, # boost representation
    'Web Attack': 40000,  # boost representation
    'Bot': 40000          # boost representation
}

sampler = SMOTE(sampling_strategy=target_samples, random_state=42)
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
print('\nTraining set shape after resampling:', X_train_resampled.shape)
print('\nClass distribution after resampling:')
print(pd.Series(y_train_resampled).value_counts())


# In[27]:


le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_resampled)
y_test_encoded = le.transform(y_test)


# In[28]:


import xgboost as xgb


# In[30]:


xgb_model = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=5,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    num_class=len(np.unique(y_train_encoded)),
    tree_method='gpu_hist',  # GPU-accelerated training
    eval_metric=['mlogloss', 'merror'],
    random_state=42,
    verbosity=1,
    early_stopping_rounds=20
)


# In[31]:


xgb_model.fit(
    X_train_resampled, 
    y_train_encoded,
    eval_set=[(X_test, y_test_encoded)],
    verbose=True
)


# In[32]:


y_pred = xgb_model.predict(X_test)
y_pred_decoded = le.inverse_transform(y_pred)

# Results
print('\nXGBoost Results:')
print(classification_report(y_test, y_pred_decoded))


# In[33]:


# Save the model to a file
xgb_model.save_model('xgboost_model3.json')  


# In[34]:


plt.figure(figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, y_pred_decoded), 
            annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.title('XGBoost Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()


# In[35]:


from catboost import CatBoostClassifier




# In[37]:


cat_model = CatBoostClassifier(
    iterations=1500,          # increased number of iterations
    learning_rate=0.025,      # tuned learning rate
    depth=14,                 # reduced depth
    l2_leaf_reg=4,            # stronger L2 regularization
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=30,
    auto_class_weights='Balanced',
    task_type='GPU',
    leaf_estimation_iterations=10,  # added parameter
)


# In[ ]:


cat_model.fit(
    X_train_resampled,
    y_train_encoded,
    eval_set=[(X_test, y_test_encoded)],
    use_best_model=True,
    plot=True
)


# In[99]:


# CatBoost MultiClass predict() returns an (n, 1) array, so flatten and cast before decoding
y_pred = cat_model.predict(X_test).ravel().astype(int)


# In[100]:


print('\nCatBoost Results:')
y_pred_decoded = le.inverse_transform(y_pred)  # decode predictions back to class names
print(classification_report(y_test, y_pred_decoded))


# In[101]:


class_names = ['BENIGN', 'Bot', 'Brute Force', 'DDoS', 'DoS', 'Port Scan', 'Web Attack']

plt.figure(figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, y_pred_decoded),
            annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,  # use class names instead of indices
            yticklabels=class_names)
plt.title('CatBoost Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=45, ha='right')  # rotate labels for readability
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


# In[102]:



cat_model.save_model('catboost_model3.cbm', format='cbm', export_parameters=None, pool=None)



# In[103]:


def plot_feature_importance(model, feature_names):
    importance = model.feature_importances_
    sorted_idx = np.argsort(importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    plt.figure(figsize=(12, 6))
    plt.barh(pos, importance[sorted_idx])
    plt.yticks(pos, np.array(feature_names)[sorted_idx])  # cast so a plain list can be fancy-indexed
    plt.xlabel('Feature Importance')
    plt.title('Feature Importance')
    plt.show()


# In[114]:


rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    max_features=20,
    random_state=42,
    n_jobs=-1
)


# In[115]:


rf.fit(X_train_resampled, y_train_encoded)


# In[116]:


y_pred_rf = rf.predict(X_test)

y_pred_rf_decoded = le.inverse_transform(y_pred_rf)

acc_rf = accuracy_score(y_test, y_pred_rf_decoded)
print(f"\nDokładność Random Forest: {acc_rf:.4f}")

print("\nRaport klasyfikacji Random Forest:")
print(classification_report(y_test, y_pred_rf_decoded))


# In[117]:


plt.figure(figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, y_pred_rf_decoded),  
            annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()


# In[ ]:


joblib.dump(rf, 'random_forest_model3.pkl')


# In[ ]:


joblib.dump(scaler, 'scaler_3.joblib')


# In[ ]:


joblib.dump(le, 'label_encoder_3.joblib')

This is how I trained my models. On the training data they give good results, but I run into problems when testing them on the data they are actually meant to work with: Suricata logs. Every attack is predicted as normal BENIGN traffic.
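
As a quick round-trip check (a minimal sketch only; it assumes the X_test / y_test split from the notebook above is still in memory and that the filenames match the artifacts saved there), I can reload the serialized XGBoost model and the label encoder and confirm they still reproduce the held-out results, so at least serialization is not the issue:

# Sanity check (sketch): reload the saved artifacts and re-score the held-out split.
# Assumes X_test / y_test from the training notebook are still defined.
import joblib
import xgboost as xgb
from sklearn.metrics import classification_report

le_chk = joblib.load('label_encoder_3.joblib')

xgb_chk = xgb.XGBClassifier()
xgb_chk.load_model('xgboost_model3.json')

# X_test was built from the already-scaled feature frame, so no extra scaling here
pred = le_chk.inverse_transform(xgb_chk.predict(X_test))
print(classification_report(y_test, pred))

The full script I then run against the Suricata eve.json is below.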

import numpy as np
import pandas as pd
import json
import joblib
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import RobustScaler
from datetime import datetime
import dateutil.parser
from collections import defaultdict
import warnings
import traceback
from pathlib import Path

warnings.filterwarnings('ignore')

def load_and_normalize_events(file_path):
    """Load events from Suricata eve.json and normalize data."""
    print("Loading events...")
    flows = defaultdict(list)
    total_events = 0
    event_types = defaultdict(int)
    skipped_lines = 0
    
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    event = json.loads(line)
                    # Count event types
                    event_type = event.get('event_type', 'unknown')
                    event_types[event_type] += 1
                    
                    # Store flow events
                    if 'flow_id' in event:
                        flow_id = str(event['flow_id'])
                        flows[flow_id].append(event)
                        total_events += 1
                        
                        if total_events % 100000 == 0:
                            print(f"Loaded {total_events} flow events...")
                except json.JSONDecodeError as e:
                    skipped_lines += 1
                    print(f"Skipped invalid JSON at line {line_num}: {str(e)}")
                    continue
                except Exception as e:
                    skipped_lines += 1
                    print(f"Error processing line {line_num}: {str(e)}")
                    continue
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return flows
    
    print("\nEvent Type Statistics:")
    for event_type, count in event_types.items():
        print(f"{event_type}: {count}")
    
    print(f"\nTotal events processed: {sum(event_types.values())}")
    print(f"Total flow events: {total_events}")
    print(f"Unique flows: {len(flows)}")
    print(f"Skipped lines: {skipped_lines}")
    
    return flows

def extract_flow_features(events):
    """Extract features from a group of events belonging to same flow."""
    features = {
        'Flow Duration': 0.0,
        'Flow Bytes/s': 0.0,
        'Flow Packets/s': 0.0,
        'Total Length of Fwd Packets': 0.0,
        'Total Length of Bwd Packets': 0.0,
        'Flow IAT Mean': 0.0,
        'Flow IAT Std': 0.0,
        'Flow IAT Max': 0.0,
        'Flow IAT Min': 0.0,
        'Fwd IAT Total': 0.0,
        'Bwd IAT Total': 0.0,
        'Fwd Packet Length Max': 0.0,
        'Fwd Packet Length Min': 0.0,
        'Bwd Packet Length Max': 0.0,
        'Bwd Packet Length Min': 0.0,
        'Packet Length Mean': 0.0,
        'Packet Length Std': 0.0,
        'Packet Length Variance': 0.0,
        'SYN Flag Count': 0.0,
        'FIN Flag Count': 0.0,
        'RST Flag Count': 0.0,
        'PSH Flag Count': 0.0,
        'ACK Flag Count': 0.0,
        'URG Flag Count': 0.0,
        'Total Fwd Packets': 0.0,
        'Total Backward Packets': 0.0,
        'Fwd Header Length': 0.0,
        'Bwd Header Length': 0.0,
        'Active Mean': 0.0,
        'Active Std': 0.0,
        'Idle Mean': 0.0,
        'Init_Win_bytes_forward': 0.0,
        'Init_Win_bytes_backward': 0.0
    }
    
    try:
        # Get first event for source/destination info
        first_event = events[0]
        src_ip = first_event.get('src_ip')
        
        # Collect timestamps
        timestamps = []
        packet_lengths = []
        fwd_packet_lengths = []
        bwd_packet_lengths = []
        
        total_bytes_fwd = 0
        total_bytes_bwd = 0
        total_packets_fwd = 0
        total_packets_bwd = 0
        
        # Process each event
        for event in events:
            # Flow statistics
            if 'flow' in event and isinstance(event['flow'], dict):
                flow = event['flow']
                if event.get('src_ip') == src_ip:
                    total_bytes_fwd += float(flow.get('bytes_toserver', 0))
                    total_packets_fwd += float(flow.get('pkts_toserver', 0))
                else:
                    total_bytes_bwd += float(flow.get('bytes_toclient', 0))
                    total_packets_bwd += float(flow.get('pkts_toclient', 0))
            
            # TCP flags
            if 'tcp' in event and isinstance(event['tcp'], dict):
                tcp = event['tcp']
                flags = str(tcp.get('tcp_flags', ''))
                features['SYN Flag Count'] += 1.0 if 'S' in flags else 0.0
                features['FIN Flag Count'] += 1.0 if 'F' in flags else 0.0
                features['RST Flag Count'] += 1.0 if 'R' in flags else 0.0
                features['PSH Flag Count'] += 1.0 if 'P' in flags else 0.0
                features['ACK Flag Count'] += 1.0 if 'A' in flags else 0.0
                features['URG Flag Count'] += 1.0 if 'U' in flags else 0.0
                
                # Window sizes
                if 'window' in tcp:
                    window_size = float(tcp['window'])
                    if event.get('src_ip') == src_ip:
                        features['Init_Win_bytes_forward'] = max(
                            features['Init_Win_bytes_forward'], 
                            window_size
                        )
                    else:
                        features['Init_Win_bytes_backward'] = max(
                            features['Init_Win_bytes_backward'], 
                            window_size
                        )
            
            # Timestamp for duration calculation
            if 'timestamp' in event:
                ts = dateutil.parser.parse(event['timestamp'])
                if ts:
                    timestamps.append(ts)
            
            # Packet length statistics
            if 'payload_printable' in event:
                length = len(event['payload_printable'])
                packet_lengths.append(length)
                if event.get('src_ip') == src_ip:
                    fwd_packet_lengths.append(length)
                else:
                    bwd_packet_lengths.append(length)
        
        # Calculate time-based features
        if timestamps:
            timestamps.sort()
            duration = (timestamps[-1] - timestamps[0]).total_seconds()
            features['Flow Duration'] = duration
            
            if duration > 0:
                features['Flow Bytes/s'] = (total_bytes_fwd + total_bytes_bwd) / duration
                features['Flow Packets/s'] = (total_packets_fwd + total_packets_bwd) / duration
            
            # Inter-arrival times
            if len(timestamps) > 1:
                iats = [(timestamps[i+1] - timestamps[i]).total_seconds() 
                       for i in range(len(timestamps)-1)]
                features['Flow IAT Mean'] = np.mean(iats)
                features['Flow IAT Std'] = np.std(iats)
                features['Flow IAT Max'] = np.max(iats)
                features['Flow IAT Min'] = np.min(iats)
        
        # Packet statistics
        features['Total Fwd Packets'] = total_packets_fwd
        features['Total Backward Packets'] = total_packets_bwd
        features['Total Length of Fwd Packets'] = total_bytes_fwd
        features['Total Length of Bwd Packets'] = total_bytes_bwd
        
        if packet_lengths:
            features['Packet Length Mean'] = np.mean(packet_lengths)
            features['Packet Length Std'] = np.std(packet_lengths)
            features['Packet Length Variance'] = np.var(packet_lengths)
        
        if fwd_packet_lengths:
            features['Fwd Packet Length Max'] = max(fwd_packet_lengths)
            features['Fwd Packet Length Min'] = min(fwd_packet_lengths)
        
        if bwd_packet_lengths:
            features['Bwd Packet Length Max'] = max(bwd_packet_lengths)
            features['Bwd Packet Length Min'] = min(bwd_packet_lengths)
        
    except Exception as e:
        print(f"Error extracting features: {str(e)}")
    
    return features

def process_flows(flows):
    """Process all flows and extract features."""
    features_list = []
    processed = 0
    total_flows = len(flows)
    
    print("Processing flows...")
    for flow_id, events in flows.items():
        if processed % 10000 == 0:
            print(f"Processed {processed}/{total_flows} flows...")
        
        features = extract_flow_features(events)
        features['flow_id'] = flow_id
        features_list.append(features)
        processed += 1
    
    return pd.DataFrame(features_list)

def main():
    # Load models
    print("Loading models...")
    try:
        models = {
            'xgboost': xgb.Booster(model_file='xgboost_model3.json'),
            'catboost': CatBoostClassifier().load_model('catboost_model3.cbm'),
            'random_forest': joblib.load('random_forest_model3.pkl'),
            'scaler': joblib.load('scaler_3.joblib'),
            'label_encoder': joblib.load('label_encoder_3.joblib')
        }
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return

    # Load and process events
    flows = load_and_normalize_events('eve.json')
    
    # Extract features
    features_df = process_flows(flows)
    
    # Prepare features for prediction
    flow_ids = features_df['flow_id']
    features_df = features_df.drop('flow_id', axis=1)
    
    # Scale features
    print("\nScaling features...")
    scaled_features = models['scaler'].transform(features_df)
    
    # Make predictions
    print("Making predictions...")
    predictions = {}
    
    # XGBoost
    dmatrix = xgb.DMatrix(scaled_features, feature_names=features_df.columns.tolist())
    xgb_pred_prob = models['xgboost'].predict(dmatrix)
    predictions['XGBoost'] = models['label_encoder'].inverse_transform(
        np.argmax(xgb_pred_prob, axis=1)
    )
    predictions['XGBoost_confidence'] = np.max(xgb_pred_prob, axis=1)
    
    # CatBoost
    cat_pred_prob = models['catboost'].predict_proba(scaled_features)
    predictions['CatBoost'] = models['label_encoder'].inverse_transform(
        np.argmax(cat_pred_prob, axis=1)
    )
    predictions['CatBoost_confidence'] = np.max(cat_pred_prob, axis=1)
    
    # Random Forest
    rf_pred_prob = models['random_forest'].predict_proba(scaled_features)
    predictions['RandomForest'] = models['label_encoder'].inverse_transform(
        np.argmax(rf_pred_prob, axis=1)
    )
    predictions['RandomForest_confidence'] = np.max(rf_pred_prob, axis=1)
    
    # Combine results
    results = pd.DataFrame(predictions) 
    results['flow_id'] = flow_ids
    results['consensus'] = results[['XGBoost', 'CatBoost', 'RandomForest']].mode(axis=1)[0]
    results['avg_confidence'] = results[
        ['XGBoost_confidence', 'CatBoost_confidence', 'RandomForest_confidence']
    ].mean(axis=1)
    
    # Save results
    results.to_csv('predictions_detailed.csv', index=False)
    print("\nResults saved to predictions_detailed.csv")
    
    # Print summary
    print("\nPrediction Summary by Model:")
    for model in ['XGBoost', 'CatBoost', 'RandomForest', 'consensus']:
        print(f"\n{model} predictions:")
        counts = results[model].value_counts()
        percentages = results[model].value_counts(normalize=True) * 100
        for label, count in counts.items():
            print(f"{label}: {count} ({percentages[label]:.2f}%)")
    
    # Print high confidence predictions
    print("\nHigh Confidence Predictions (>0.9):")
    high_conf = results[results['avg_confidence'] > 0.9]
    print(high_conf['consensus'].value_counts())
    
    return results, features_df

if __name__ == "__main__":
    main()

This is how I tried to test them. I simply copied the data from Suricata, i.e. the exact eve.json file that Suricata writes.
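
What I plan to check next (a rough sketch only, and hypothetical: it assumes main() is changed to also return features_df and that the unscaled training frame X from the notebook is still available) is whether the features extracted from eve.json even land in the same ranges as the CICIDS2017 features the scaler and models were fitted on. In particular I am not sure whether columns like Flow Duration use the same time unit in CICFlowMeter and in my extractor, and a systematic mismatch like that could be related to everything coming out as BENIGN.

# Rough train/serve skew check (hypothetical helper, not part of the pipeline above):
# compare per-feature medians and zero fractions between the CICIDS2017 training
# features and the features extracted from eve.json.
import pandas as pd

def compare_feature_ranges(train_df: pd.DataFrame, live_df: pd.DataFrame) -> pd.DataFrame:
    """Return per-feature medians and the share of exact zeros in both frames."""
    rows = []
    for col in train_df.columns:
        rows.append({
            'feature': col,
            'train_median': train_df[col].median(),
            'live_median': live_df[col].median(),
            'train_zero_frac': float((train_df[col] == 0).mean()),
            'live_zero_frac': float((live_df[col] == 0).mean()),
        })
    return pd.DataFrame(rows)

# X is the unscaled CICIDS2017 feature frame from the training notebook,
# features_df is what process_flows() builds from eve.json (with flow_id dropped).
# report = compare_feature_ranges(X, features_df)
# print(report.sort_values('live_zero_frac', ascending=False).to_string(index=False))

Does that look like a reasonable way to track down why the models behave so differently on the Suricata data, or is there a better way to test models trained on CICIDS2017 against live logs?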