Source code for RADAR.static_data.static_datasets_uci

from ucimlrepo import fetch_ucirepo
from io import BytesIO
import requests
import pandas as pd
import zipfile
import numpy as np

"""
Datasets used in anomaly detection (Source: UCI Machine Learning Repository)
Source: https://archive.ics.uci.edu/ml/index.php

Datasets used:
 1. "shuttle": Dataset on event classifications in a space shuttle, useful for identifying anomalies in control systems.
 2. "kddcup99": Dataset with network traffic logs, used to detect computer attacks and anomalies in security systems.
 3. "spambase": Dataset of e-mails labeled as spam or non-spam, ideal for detecting anomalies in the classification of messages.
 4. "mammographic_mass": Data on mammary tumors, used to identify anomalies in the classification of benign or malignant mammary masses.
 5. "arrhythmia": Electrocardiogram data, useful for detecting heart rhythm abnormalities.
 6. "default_of_credit_card_clients": Dataset containing financial information, used to detect clients with high risk of non-payment.
 7. "Wine Quality": Dataset to detect anomalies in the quality of wines according to various chemical characteristics.
 8. "Detection of IoT Botnet Attacks (N-BaIoT)": Used to detect botnet attacks on IoT devices, focused on identifying anomalous patterns in networks.
 9. "Human Activity Recognition Using Smartphones": A dataset that measures human activities using smartphone sensors, useful for detecting anomalies in human behavior.
"""


def global_load(name_dataset):
    """
    Dispatch to the loader registered for ``name_dataset``.

    Every entry of the module-level ``datasets`` registry stores a loader
    callable at index 0 and the keyword arguments to call it with at index 1.

    Parameters:
        name_dataset (str): Key of the dataset in ``datasets``.

    Returns:
        Whatever the registered loader returns; the shape of the result
        differs from one loader to another.

    Raises:
        KeyError: If ``name_dataset`` is not a key of ``datasets``.
    """
    entry = datasets[name_dataset]
    loader, loader_kwargs = entry[0], entry[1]
    return loader(**loader_kwargs)
# fetch dataset
def load_from_id(id):
    """
    Fetch a dataset from the UCI repository by its numeric identifier.

    The repository metadata and the variable description are printed to
    stdout as a side effect.

    Parameters:
        id (int): Identifier of the dataset in the UCI repository. The name
            shadows the builtin ``id`` but is kept because the ``datasets``
            registry passes it as a keyword.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``np.array`` of the dataset's
        features and ``y`` is ``np.array`` of its targets.
    """
    repo_entry = fetch_ucirepo(id=id)
    payload = repo_entry.data
    features, targets = payload.features, payload.targets

    print("Metadata:", repo_entry.metadata)
    print("Variable information:", repo_entry.variables)

    return np.array(features), np.array(targets)
def load_from_url(url, **kwargs):
    """
    Download a delimited text file and parse it with ``pd.read_csv``.

    Parameters:
        url (str): The URL from which to fetch the dataset.
        **kwargs: Additional arguments forwarded unchanged to
            ``pd.read_csv()``.

    Returns:
        pd.DataFrame: The dataset parsed from the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server stops responding.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of handing an error page to the
    # CSV parser, which would yield a garbage frame or an obscure parse error.
    response.raise_for_status()
    return pd.read_csv(BytesIO(response.content), **kwargs)
def load_arrhythmia(url, **kwargs):
    """
    Download the Arrhythmia table and separate features from the label.

    The last column of the downloaded table is treated as the class label
    and every column before it as a feature.

    Parameters:
        url (str): Location of the raw data file.
        **kwargs: Forwarded to ``load_from_url`` (and from there to
            ``pd.read_csv``), e.g. ``header`` and ``na_values``.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays; ``X`` holds all columns but the
        last, ``y`` holds the last column.
    """
    table = load_from_url(url, **kwargs)
    return np.array(table.iloc[:, :-1]), np.array(table.iloc[:, -1])
def load_human_activity_recognition(url, **kwargs):
    """
    Download the UCI HAR zip archive and load its train/test splits.

    The archive listing is printed to stdout. The four data files are read
    directly from the in-memory archive, so nothing is written to the
    current working directory.

    Parameters:
        url (str): Location of the ``UCI HAR Dataset`` zip archive.
        **kwargs: Forwarded unchanged to ``pd.read_csv`` for every file.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server stops responding.
        KeyError: If an expected member is missing from the archive.
    """
    # Order matters: it defines the order of the returned tuple.
    members = (
        "UCI HAR Dataset/train/X_train.txt",
        "UCI HAR Dataset/test/X_test.txt",
        "UCI HAR Dataset/train/y_train.txt",
        "UCI HAR Dataset/test/y_test.txt",
    )

    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=60)
    # Surface HTTP errors here rather than as a confusing BadZipFile later.
    response.raise_for_status()

    arrays = []
    with zipfile.ZipFile(BytesIO(response.content)) as archive:
        # List the files contained in the archive.
        archive.printdir()
        for member in members:
            # Parse each member from the archive without extracting to disk.
            with archive.open(member) as handle:
                arrays.append(np.array(pd.read_csv(handle, **kwargs)))

    return tuple(arrays)
def load_kddcup99(**kwargs):
    """
    Load the KDD Cup 99 dataset through sklearn's built-in fetcher.

    The original kdd.ics.uci.edu URLs are no longer available, so sklearn is
    used for mirror selection and local caching. The 10 % subset
    (``percent10=True``) is fetched and, when it holds more than 20 000
    rows, reduced to a stratified sample of about 20 000 rows so the
    frontend stays responsive.

    Parameters:
        **kwargs: Accepted so the function can be called through the
            ``datasets`` registry; currently ignored.

    Returns:
        tuple: ``(data, attack_types, attack_class)`` where ``data`` is the
        feature DataFrame with categorical columns one-hot encoded,
        ``attack_types`` is the sorted list of distinct labels, and
        ``attack_class`` is the per-row label array (trailing ``.`` removed).
    """
    from sklearn.datasets import fetch_kddcup99

    def _as_text(value):
        # Decode byte strings; leave every other value untouched.
        return value.decode() if isinstance(value, bytes) else value

    print("Downloading KDD Cup 99 dataset (10% subset) via sklearn...")
    bunch = fetch_kddcup99(as_frame=True, percent10=True)
    data = bunch.data  # features DataFrame

    # Decode byte labels *before* casting to str: str(b"normal.") is the
    # literal "b'normal.'", which would also turn the rstrip below into a
    # no-op. Decoding first gives the same clean label on every pandas
    # version, mirroring what is done for the feature columns further down.
    target = bunch.target.map(_as_text).astype(str).str.rstrip(".")

    # --- Stratified sub-sample to keep the frontend responsive -------------
    MAX_ROWS = 20_000
    if len(data) > MAX_ROWS:
        from sklearn.model_selection import train_test_split

        # Keep only the "test" part of the split: a stratified sample of
        # roughly MAX_ROWS rows with a fixed seed for reproducibility.
        _, data, _, target = train_test_split(
            data,
            target,
            test_size=MAX_ROWS / len(data),
            stratify=target,
            random_state=42,
        )
        data = data.reset_index(drop=True)
        target = target.reset_index(drop=True)
    # ------------------------------------------------------------------------

    # Decode any byte-string columns and one-hot encode categoricals so the
    # downstream pipeline does not receive raw bytes or category columns.
    for col in data.columns:
        if data[col].dtype == object:
            data[col] = data[col].apply(_as_text)

    cat_cols = data.select_dtypes(include=["object", "category"]).columns.tolist()
    if cat_cols:
        data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

    attack_class = target.values
    attack_types = sorted(set(attack_class))
    return data, attack_types, attack_class
# Registry consumed by ``global_load``: maps a dataset name to
# ``[loader, kwargs]``, where ``loader(**kwargs)`` returns the dataset.
datasets = {
    "shuttle": [load_from_id, {"id": 148}],
    "kddcup99": [load_kddcup99, {}],
    "spambase": [load_from_id, {"id": 94}],
    "mammographic_mass": [load_from_id, {"id": 161}],
    "arrhythmia": [
        load_arrhythmia,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data",
            "header": None,
            # The raw file marks missing values with "?".
            "na_values": "?",
        },
    ],
    "default_of_credit_card_clients": [load_from_id, {"id": 350}],
    # Only the benign traffic capture of a single device is loaded here.
    "detection_of_IoT_botnet_attacks_N_BaIoT": [
        load_from_url,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/00442/Philips_B120N10_Baby_Monitor/benign_traffic.csv",
        },
    ],
    "human_activity_recognition": [
        load_human_activity_recognition,
        {
            "url": "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip",
            "header": None,
            # ``delim_whitespace=True`` is deprecated in recent pandas;
            # ``sep=r"\s+"`` is the documented equivalent.
            "sep": r"\s+",
        },
    ],
}