# Source code for RADAR.static_data.static_datasets_uci

from ucimlrepo import fetch_ucirepo
from io import BytesIO
import requests
import pandas as pd
import zipfile
import numpy as np

"""
Datasets used in anomaly detection (Source: UCI Machine Learning Repository)
Source: https://archive.ics.uci.edu/ml/index.php

Datasets used:
 1. "shuttle": Dataset on event classifications in a space shuttle, useful for identifying anomalies in control systems.
 2. "kddcup99": Dataset with network traffic logs, used to detect computer attacks and anomalies in security systems.
 3. "spambase": Dataset of e-mail labeled as spam or non-spam, ideal for detecting anomalies in the classification of messages.
 4. "mammographic_mass": Data on mammary tumors, used to identify anomalies in the classification of benign or malignant mammary masses.
 5. "arrhythmia": Electrocardiogram data, useful for detecting heart rhythm abnormalities.
 6. "default_of_credit_card_clients": Dataset containing financial information, used to detect clients with high risk of non-payment.
 7. "Wine Quality": Dataset to detect anomalies in the quality of wines according to various chemical characteristics.
 8. "Detection of IoT Botnet Attacks (N-BaIoT)": Used to detect botnet attacks on IoT devices, focused on identifying anomalous patterns in networks.
 9. "Human Activity Recognition Using Smartphones": A dataset that measures human activities using smartphone sensors, useful for detecting anomalies in human behavior.
"""



[docs]
def global_load(name_dataset):
    """
    Loads a dataset using the corresponding loading method and parameters.

    The module-level ``datasets`` registry maps every dataset name to a
    ``[loader, kwargs]`` pair; the loader is invoked as ``loader(**kwargs)``.

    Parameters:
    name_dataset (str): The name of the dataset to be loaded. It must be a
        key of the ``datasets`` registry.

    Returns:
    Whatever the registered loader returns. The result layout is
    loader-specific and is not normalised here.

    Raises:
    KeyError: If ``name_dataset`` is not registered. The message lists the
        names that are available.
    """
    # Keep the try body to the lookup only, so a KeyError raised inside a
    # loader is never mistaken for an unknown dataset name.
    try:
        entry = datasets[name_dataset]
    except KeyError:
        available = ", ".join(sorted(datasets))
        raise KeyError(
            f"Unknown dataset {name_dataset!r}. Available datasets: {available}"
        ) from None

    method_load, kwargs = entry[0], entry[1]
    return method_load(**kwargs)



# fetch dataset

[docs]
def load_from_id(id):
    """
    Downloads a UCI repository dataset by ID and returns it as NumPy arrays.

    As a side effect, the dataset metadata and the variable description
    provided by the repository are printed to stdout.

    Parameters:
        id (int): The identifier of the dataset in the UCI repository.

    Returns:
        tuple: A tuple containing:
            - X (np.ndarray): The feature matrix.
            - y (np.ndarray): The target variable(s).
    """
    repo_entry = fetch_ucirepo(id=id)
    payload = repo_entry.data
    features, targets = payload.features, payload.targets

    # Echo what the repository says about the dataset and its variables.
    print("Metadata:", repo_entry.metadata)
    print("Variable information:", repo_entry.variables)

    return np.array(features), np.array(targets)




[docs]
def load_from_url(url, *, timeout=60, **kwargs):
    """
    Loads a dataset from a given URL.

    The whole response body is downloaded into memory and then parsed as CSV.

    Parameters:
        url (str): The URL from which to fetch the dataset.
        timeout (float or tuple): Timeout in seconds handed to
            `requests.get()`, so that an unresponsive server cannot block
            the caller forever. Keyword-only.
        **kwargs: Additional arguments to be passed to `pd.read_csv()`.

    Returns:
        pd.DataFrame: The dataset loaded from the URL.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors; otherwise the body of an error page would
    # be handed to the CSV parser and come back as a bogus DataFrame.
    response.raise_for_status()
    return pd.read_csv(BytesIO(response.content), **kwargs)




[docs]
def load_arrhythmia(url, **kwargs):
    """Fetch the Arrhythmia table and separate features from the label.

    The raw UCI file keeps the class label in its last column and marks
    missing values with '?'.

    Parameters:
        url (str): Location of the raw data file.
        **kwargs: Forwarded unchanged to `load_from_url()`.

    Returns:
        tuple: A tuple containing:
            - X (np.ndarray): Every column except the last one.
            - y (np.ndarray): The last column.
    """
    table = load_from_url(url, **kwargs)

    # Positional slicing: the label is always the final column.
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    return np.array(features), np.array(labels)




[docs]
def load_human_activity_recognition(url, *, timeout=60, **kwargs):
    """
    Downloads the UCI HAR archive and loads its train/test splits.

    The ZIP file is fetched into memory, its listing is printed to stdout,
    and the four required text files are extracted below a ``UCI_HAR``
    directory (relative to the current working directory) before being
    parsed with pandas.

    Parameters:
        url (str): The URL of the ZIP archive.
        timeout (float or tuple): Timeout in seconds handed to
            `requests.get()`. Keyword-only.
        **kwargs: Additional arguments to be passed to `pd.read_csv()` for
            each of the four files.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``, each an np.ndarray.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If an expected member is missing from the archive.
    """
    # Order matters: it is the order of the returned tuple.
    archive_members = (
        "UCI HAR Dataset/train/X_train.txt",
        "UCI HAR Dataset/test/X_test.txt",
        "UCI HAR Dataset/train/y_train.txt",
        "UCI HAR Dataset/test/y_test.txt",
    )

    response = requests.get(url, timeout=timeout)
    # Without this check an HTTP error page would only surface later as a
    # confusing "File is not a zip file" error.
    response.raise_for_status()

    # Unpack the ZIP archive straight from memory.
    with zipfile.ZipFile(BytesIO(response.content)) as z:
        # List the files contained in the archive.
        z.printdir()

        # Extract only the files that are needed; `extract` returns the path
        # that was actually written, which is reused for reading below.
        extracted_paths = [
            z.extract(member, "UCI_HAR") for member in archive_members
        ]

    # Read the extracted files with pandas.
    X_train, X_test, y_train, y_test = (
        np.array(pd.read_csv(path, **kwargs)) for path in extracted_paths
    )
    return X_train, X_test, y_train, y_test




[docs]
def _decode_if_bytes(value):
    """Return ``value`` decoded to ``str`` when it is ``bytes``, else unchanged."""
    return value.decode() if isinstance(value, bytes) else value


def load_kddcup99(*, max_rows=20_000, **kwargs):
    """Reads the KDD Cup 99 dataset using sklearn's built-in fetcher.

    The original kdd.ics.uci.edu URLs are no longer available, so we rely on
    sklearn which handles mirror selection and local caching automatically.

    Uses the 10 % subset (percent10=True) and then takes a stratified sample
    of roughly ``max_rows`` rows (20 000 by default) so the frontend stays
    responsive.

    Parameters:
        max_rows (int or None): Upper bound for the number of rows kept.
            ``None`` disables the sub-sampling. Keyword-only.
        **kwargs: Accepted for compatibility with the loader registry and
            ignored.

    Returns:
        tuple: A tuple containing:
            - data (pd.DataFrame): The features, with categorical columns
              one-hot encoded (first level dropped).
            - attack_types (list of str): Sorted distinct labels.
            - attack_class (np.ndarray): The label of every row, without the
              trailing '.'.

    Raises:
        ValueError: If ``max_rows`` is not ``None`` and not positive.
    """
    from sklearn.datasets import fetch_kddcup99

    if max_rows is not None and max_rows <= 0:
        raise ValueError(f"max_rows must be positive or None, got {max_rows!r}")

    print("Downloading KDD Cup 99 dataset (10% subset) via sklearn...")
    bunch = fetch_kddcup99(as_frame=True, percent10=True)

    data = bunch.data  # features DataFrame
    # Decode explicitly before stringifying: str(b"normal.") is "b'normal.'",
    # which would leave the quoting in every label and defeat the rstrip.
    target = bunch.target.map(_decode_if_bytes).astype(str).str.rstrip(".")

    # --- Stratified sub-sample to keep the frontend responsive ---------------
    if max_rows is not None and len(data) > max_rows:
        from sklearn.model_selection import train_test_split

        # Only the "test" part of the split is kept; stratifying on the label
        # preserves the class proportions in the sample.
        _, data, _, target = train_test_split(
            data, target,
            test_size=max_rows / len(data),
            stratify=target,
            random_state=42,
        )
        data = data.reset_index(drop=True)
        target = target.reset_index(drop=True)
    else:
        # Work on a private copy so the frame owned by `bunch` is not
        # modified by the column assignments below.
        data = data.copy()
    # -------------------------------------------------------------------------

    # Decode any byte-string columns and one-hot encode categoricals so the
    # downstream pipeline always receives a fully numeric DataFrame.
    # NOTE(review): sklearn may deliver numeric features in object-dtype
    # columns too (confirm against the installed version); those are
    # converted to numbers here so that only genuinely textual columns end up
    # being one-hot encoded.
    for col in data.columns:
        if data[col].dtype == object:
            decoded = data[col].map(_decode_if_bytes)
            try:
                data[col] = pd.to_numeric(decoded)
            except (ValueError, TypeError):
                # Not numeric: keep the decoded text for one-hot encoding.
                data[col] = decoded
    cat_cols = data.select_dtypes(include=["object", "category"]).columns.tolist()
    if cat_cols:
        data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

    attack_class = target.values
    attack_types = sorted(set(attack_class))

    return data, attack_types, attack_class




# Registry consumed by `global_load`: dataset name -> [loader, loader kwargs].
# The loader is called as `loader(**kwargs)`.
# NOTE(review): the module docstring also lists "Wine Quality", which has no
# entry here; confirm whether that omission is intentional.
datasets = {
    "shuttle": [load_from_id, {"id": 148}],
    "kddcup99": [load_kddcup99, {}],
    "spambase": [load_from_id, {"id": 94}],
    "mammographic_mass": [load_from_id, {"id": 161}],
    "arrhythmia": [
        load_arrhythmia,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data",
            # The raw file has no header row and marks missing values with '?'.
            "header": None,
            "na_values": "?",
        },
    ],
    "default_of_credit_card_clients": [load_from_id, {"id": 350}],
    # Only the benign traffic capture of a single device is loaded.
    "detection_of_IoT_botnet_attacks_N_BaIoT": [
        load_from_url,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/00442/Philips_B120N10_Baby_Monitor/benign_traffic.csv",
        },
    ],
    "human_activity_recognition": [
        load_human_activity_recognition,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip",
            "header": None,
            # `delim_whitespace=True` is deprecated since pandas 2.2; a
            # whitespace regex separator is the documented equivalent.
            "sep": r"\s+",
        },
    ],
}