from ucimlrepo import fetch_ucirepo
from io import BytesIO
import requests
import pandas as pd
import zipfile
import numpy as np
"""
Datasets used in anomaly detection (Source: UCI Machine Learning Repository)
Source: https://archive.ics.uci.edu/ml/index.php
Datasets used:
1. "shuttle": Dataset on event classifications in a space shuttle, useful for identifying anomalies in control systems.
2. "kddcup99": Dataset with network traffic logs, used to detect computer attacks and anomalies in security systems.
3. "spambase": Dataset of e-mails labeled as spam or non-spam, ideal for detecting anomalies in the classification of messages.
4. "mammographic_mass": Data on mammary tumors, used to identify anomalies in the classification of benign or malignant mammary masses.
5. "arrhythmia": Electrocardiogram data, useful for detecting heart rhythm abnormalities.
6. "default_of_credit_card_clients": Dataset containing financial information, used to detect clients with high risk of non-payment.
7. "Wine Quality": Dataset to detect anomalies in the quality of wines according to various chemical characteristics.
8. "Detection of IoT Botnet Attacks (N-BaIoT)": Used to detect botnet attacks on IoT devices, focused on identifying anomalous patterns in networks.
9. "Human Activity Recognition Using Smartphones": A dataset that measures human activities using smartphone sensors, useful for detecting anomalies in human behavior.
"""
[docs]
def global_load(name_dataset):
    """
    Load a dataset through the loader registered for it in ``datasets``.

    Parameters:
        name_dataset (str): The name of the dataset to be loaded; must be a
            key of the module-level ``datasets`` registry.

    Returns:
        Whatever the registered loader returns. The shape of the result
        depends on the loader that is registered for the name.

    Raises:
        KeyError: If ``name_dataset`` is not registered. The message lists
            the names that are available.
    """
    try:
        entry = datasets[name_dataset]
    except KeyError:
        # Re-raise the same exception type, but tell the caller which names
        # would have worked instead of only echoing the bad key.
        available = ", ".join(sorted(map(str, datasets)))
        raise KeyError(
            f"Unknown dataset {name_dataset!r}; available datasets: {available}"
        ) from None

    # Each registry entry is ``[loader, loader_kwargs]``.
    method_load = entry[0]
    kwargs = entry[1]
    return method_load(**kwargs)
# fetch dataset
[docs]
def load_from_id(id):
    """
    Retrieve a dataset from the UCI repository by its identifier.

    The dataset's metadata and variable information are printed to stdout
    while loading.

    Parameters:
        id (int): The identifier of the dataset in the UCI repository.

    Returns:
        tuple: A tuple containing:
            - X (np.ndarray): The feature matrix.
            - y (np.ndarray): The target variable(s).
    """
    repo_entry = fetch_ucirepo(id=id)
    payload = repo_entry.data
    features, targets = payload.features, payload.targets

    print("Metadata:", repo_entry.metadata)
    # variable information
    print("Variable information:", repo_entry.variables)

    # Callers receive plain arrays, not the objects returned by the fetcher.
    return np.array(features), np.array(targets)
[docs]
def load_from_url(url, **kwargs):
    """
    Download a delimited text file and parse it with pandas.

    Parameters:
        url (str): The URL from which to fetch the dataset.
        **kwargs: Additional arguments to be passed to `pd.read_csv()`.

    Returns:
        pd.DataFrame: The dataset loaded from the URL.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=60)
    # Fail on HTTP errors here instead of handing an error page to the parser.
    response.raise_for_status()
    return pd.read_csv(BytesIO(response.content), **kwargs)
[docs]
def load_arrhythmia(url, **kwargs):
    """Fetch the Arrhythmia table and separate features from the class label.

    The raw UCI file keeps the class label in its last column and marks
    missing values with '?'.

    Parameters:
        url (str): The URL of the raw data file.
        **kwargs: Forwarded unchanged to ``load_from_url``.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays, where ``X`` holds every column
        except the last and ``y`` holds the last column.
    """
    frame = load_from_url(url, **kwargs)
    features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]
    return np.array(features), np.array(labels)
[docs]
def load_human_activity_recognition(url, **kwargs):
    """
    Download the UCI HAR archive and load its train/test splits.

    The archive listing is printed to stdout, and the four data files are
    extracted below a ``UCI_HAR`` directory (relative to the current working
    directory) before being read.

    Parameters:
        url (str): The URL of the ZIP archive.
        **kwargs: Additional arguments to be passed to `pd.read_csv()` for
            every one of the four files.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    response = requests.get(url, timeout=60)
    # Fail on HTTP errors here; otherwise an error page would only surface
    # as a confusing BadZipFile when the payload is opened below.
    response.raise_for_status()

    extract_dir = "UCI_HAR"
    # Order matters: it is the order of the returned tuple.
    members = (
        "UCI HAR Dataset/train/X_train.txt",
        "UCI HAR Dataset/test/X_test.txt",
        "UCI HAR Dataset/train/y_train.txt",
        "UCI HAR Dataset/test/y_test.txt",
    )

    # Unpack the ZIP archive held in memory.
    with zipfile.ZipFile(BytesIO(response.content)) as archive:
        # List the files inside the archive.
        archive.printdir()
        # Extract only the files that are needed; extract() returns the path
        # it actually wrote, which is then used for reading.
        extracted_paths = [
            archive.extract(member, extract_dir) for member in members
        ]

    # Read the extracted files with pandas.
    X_train, X_test, y_train, y_test = (
        np.array(pd.read_csv(path, **kwargs)) for path in extracted_paths
    )
    return X_train, X_test, y_train, y_test
[docs]
def load_kddcup99(**kwargs):
    """Load a sub-sampled, one-hot encoded KDD Cup 99 dataset via sklearn.

    The original kdd.ics.uci.edu URLs are no longer available, so the data is
    obtained through ``sklearn.datasets.fetch_kddcup99`` using the 10 %
    subset (``percent10=True``). When that subset holds more than
    ``MAX_ROWS`` (20 000) rows, a stratified sample of about ``MAX_ROWS``
    rows is kept so the frontend stays responsive.

    Parameters:
        **kwargs: Accepted so this loader can be called like the other
            registered loaders; currently not used.

    Returns:
        tuple: A tuple containing:
            - data (pd.DataFrame): The features, with non-numeric columns
              one-hot encoded.
            - attack_types (list): The sorted distinct labels.
            - attack_class: ``target.values``, i.e. the label of every row
              with the trailing "." removed.
    """
    from sklearn.datasets import fetch_kddcup99

    print("Downloading KDD Cup 99 dataset (10% subset) via sklearn...")
    bunch = fetch_kddcup99(as_frame=True, percent10=True)
    data = bunch.data  # features DataFrame
    # Decode byte-string labels explicitly (as is done for the features
    # below) instead of relying on how astype(str) treats bytes objects.
    target = bunch.target.map(_decode_bytes).astype(str).str.rstrip(".")

    # --- Stratified sub-sample to keep the frontend responsive ---------------
    MAX_ROWS = 20_000
    if len(data) > MAX_ROWS:
        from sklearn.model_selection import train_test_split

        # The "test" part of the split is the sample that is kept.
        _, data, _, target = train_test_split(
            data,
            target,
            test_size=MAX_ROWS / len(data),
            stratify=target,
            random_state=42,
        )
        data = data.reset_index(drop=True)
        target = target.reset_index(drop=True)
    # -------------------------------------------------------------------------

    # Decode any byte-string columns and one-hot encode categoricals so the
    # downstream pipeline always receives a fully numeric DataFrame.
    for col in data.columns:
        if data[col].dtype == object:
            # Series.apply re-infers the dtype of its result, so an object
            # column that only holds numbers comes back as a numeric column.
            data[col] = data[col].apply(_decode_bytes)

    # Treat every column that is still non-numeric as categorical, whatever
    # dtype name pandas uses for it, so nothing non-numeric is left behind.
    cat_cols = data.select_dtypes(exclude=["number", "bool"]).columns.tolist()
    if cat_cols:
        data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

    attack_class = target.values
    attack_types = sorted(set(attack_class))
    return data, attack_types, attack_class


def _decode_bytes(value):
    """Return ``value`` decoded to ``str`` if it is ``bytes``, else unchanged."""
    return value.decode() if isinstance(value, bytes) else value
# Registry of the available datasets: each name maps to
# ``[loader, loader_kwargs]`` and ``global_load`` calls
# ``loader(**loader_kwargs)`` for the requested name.
datasets = {
    "shuttle": [load_from_id, {"id": 148}],
    "kddcup99": [load_kddcup99, {}],
    "spambase": [load_from_id, {"id": 94}],
    "mammographic_mass": [load_from_id, {"id": 161}],
    "arrhythmia": [
        load_arrhythmia,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data",
            # The raw file has no header row and marks missing values with "?".
            "header": None,
            "na_values": "?",
        },
    ],
    "default_of_credit_card_clients": [load_from_id, {"id": 350}],
    "detection_of_IoT_botnet_attacks_N_BaIoT": [
        load_from_url,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/00442/Philips_B120N10_Baby_Monitor/benign_traffic.csv",
        },
    ],
    "human_activity_recognition": [
        load_human_activity_recognition,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip",
            "header": None,
            # Whitespace-separated columns; ``sep=r"\s+"`` is the replacement
            # for the deprecated ``delim_whitespace=True`` keyword.
            "sep": r"\s+",
        },
    ],
}