import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Build a small in-memory sample of network-traffic records.
# Each key is a column name; each list holds that column's five row values.
data = {
    'Timestamp': [
        '2024-02-20 08:00:00',
        '2024-02-20 08:01:00',
        '2024-02-20 08:02:00',
        '2024-02-20 08:03:00',
        '2024-02-20 08:04:00',
    ],
    'Source IP': [
        '192.168.1.10',
        '192.168.1.15',
        '192.168.1.20',
        '192.168.1.25',
        '192.168.1.30',
    ],
    'Destination IP': [
        '54.239.26.214',
        '52.45.129.123',
        '203.0.113.12',
        '185.87.120.10',
        '54.239.26.214',
    ],
    'Protocol': ['TCP', 'UDP', 'TCP', 'TCP', 'UDP'],
    'Port': [443, 123, 80, 22, 161],
    'Bytes Transferred': [1024, 512, 2048, 4096, 8192],
    'Packets Transferred': [10, 5, 15, 20, 25],
    'Label': ['Normal', 'Normal', 'Malicious', 'Malicious', 'Normal'],
}
df = pd.DataFrame(data)

# --- Preprocessing ---------------------------------------------------------

# Step 1: parse the 'Timestamp' strings into pandas datetime values.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps

# Step 2: replace the string categories in 'Protocol' and 'Label' with
# integer codes. A separate encoder is kept per column so each mapping can be
# inverted later via its own `inverse_transform`.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; confirm it is the desired choice for the 'Protocol' feature column.
protocol_encoder = LabelEncoder()
label_encoder = LabelEncoder()
encoders_by_column = (
    ('Protocol', protocol_encoder),
    ('Label', label_encoder),
)
for column_name, column_encoder in encoders_by_column:
    df[column_name] = column_encoder.fit_transform(df[column_name])

# Step 3: standardize the two numeric traffic-volume columns with
# StandardScaler, fitted on these same rows.
numeric_columns = ['Bytes Transferred', 'Packets Transferred']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = scaled_values

# Show the fully preprocessed frame.
print(df)