{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd  # DataFrame, Series\n",
    "import numpy as np  # Scientific-computing package, arrays\n",
    "from matplotlib import pyplot as plt  # Plots\n",
    "\n",
    "from sklearn.model_selection import train_test_split  # Split the dataset into train and test\n",
    "from sklearn.preprocessing import LabelEncoder  # Convert categorical data to numeric\n",
    "from sklearn import preprocessing  # Data normalization\n",
    "\n",
    "# Imports for drawing the decision tree\n",
    "import graphviz\n",
    "# The standalone StringIO module exists only on Python 2; on Python 3 the\n",
    "# standard io module is bound to the same alias so io.StringIO keeps working.\n",
    "try:\n",
    "    import StringIO as io\n",
    "except ImportError:\n",
    "    import io\n",
    "import pydotplus\n",
    "import imageio\n",
    "\n",
    "import time  # Measure training time\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset de conexiones KDD'99\n",
    "Dataset: kddcup.data.corrected (Completo)\n",
    "- Importacion del Dataset\n",
    "- Ingenieria de atributos\n",
    "- Visualización de datos y gráfico de 3 de los principales atributos\n",
    "- Entrenamiento de un clasificador (Arbol de decision)\n",
    "- Predecir objetivo usando el clasificador entrenado\n",
    "- Comparacion de resultados con progresion lineal"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "IMPORTACION DEL DATASET Y DESCRIPCION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names applied to the CSV below, in file order; the final entry is the label column.\n",
    "col_names = [\"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\n",
    "    \"dst_bytes\",\"land\",\"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\n",
    "    \"logged_in\",\"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\n",
    "    \"num_file_creations\",\"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\n",
    "    \"is_host_login\",\"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\",\n",
    "    \"srv_serror_rate\",\"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\",\n",
    "    \"diff_srv_rate\",\"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\n",
    "    \"dst_host_same_srv_rate\",\"dst_host_diff_srv_rate\",\"dst_host_same_src_port_rate\",\n",
    "    \"dst_host_srv_diff_host_rate\",\"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n",
    "    \"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\"label\"]\n",
    "\n",
    "# header=None: no row of the file is treated as a header, so the names come from col_names.\n",
    "data = pd.read_csv('../dataset/kddcup.data.corrected', header=None, names=col_names)"
   ]
  },
  { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | duration | \n", "src_bytes | \n", "dst_bytes | \n", "land | \n", "wrong_fragment | \n", "urgent | \n", "hot | \n", "num_failed_logins | \n", "logged_in | \n", "num_compromised | \n", "... | \n", "dst_host_count | \n", "dst_host_srv_count | \n", "dst_host_same_srv_rate | \n", "dst_host_diff_srv_rate | \n", "dst_host_same_src_port_rate | \n", "dst_host_srv_diff_host_rate | \n", "dst_host_serror_rate | \n", "dst_host_srv_serror_rate | \n", "dst_host_rerror_rate | \n", "dst_host_srv_rerror_rate | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "... | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "4.898431e+06 | \n", "
mean | \n", "4.834243e+01 | \n", "1.834621e+03 | \n", "1.093623e+03 | \n", "5.716116e-06 | \n", "6.487792e-04 | \n", "7.961733e-06 | \n", "1.243766e-02 | \n", "3.205108e-05 | \n", "1.435290e-01 | \n", "8.088304e-03 | \n", "... | \n", "2.329811e+02 | \n", "1.892142e+02 | \n", "7.537132e-01 | \n", "3.071111e-02 | \n", "6.050520e-01 | \n", "6.464107e-03 | \n", "1.780911e-01 | \n", "1.778859e-01 | \n", "5.792780e-02 | \n", "5.765941e-02 | \n", "
std | \n", "7.233298e+02 | \n", "9.414311e+05 | \n", "6.450123e+05 | \n", "2.390833e-03 | \n", "4.285434e-02 | \n", "7.215084e-03 | \n", "4.689782e-01 | \n", "7.299408e-03 | \n", "3.506116e-01 | \n", "3.856481e+00 | \n", "... | \n", "6.402094e+01 | \n", "1.059128e+02 | \n", "4.111860e-01 | \n", "1.085432e-01 | \n", "4.809877e-01 | \n", "4.125978e-02 | \n", "3.818382e-01 | \n", "3.821774e-01 | \n", "2.309428e-01 | \n", "2.309777e-01 | \n", "
min | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "... | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "
25% | \n", "0.000000e+00 | \n", "4.500000e+01 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "... | \n", "2.550000e+02 | \n", "4.900000e+01 | \n", "4.100000e-01 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "
50% | \n", "0.000000e+00 | \n", "5.200000e+02 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "... | \n", "2.550000e+02 | \n", "2.550000e+02 | \n", "1.000000e+00 | \n", "0.000000e+00 | \n", "1.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "
75% | \n", "0.000000e+00 | \n", "1.032000e+03 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "... | \n", "2.550000e+02 | \n", "2.550000e+02 | \n", "1.000000e+00 | \n", "4.000000e-02 | \n", "1.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "
max | \n", "5.832900e+04 | \n", "1.379964e+09 | \n", "1.309937e+09 | \n", "1.000000e+00 | \n", "3.000000e+00 | \n", "1.400000e+01 | \n", "7.700000e+01 | \n", "5.000000e+00 | \n", "1.000000e+00 | \n", "7.479000e+03 | \n", "... | \n", "2.550000e+02 | \n", "2.550000e+02 | \n", "1.000000e+00 | \n", "1.000000e+00 | \n", "1.000000e+00 | \n", "1.000000e+00 | \n", "1.000000e+00 | \n", "1.000000e+00 | \n", "1.000000e+00 | \n", "1.000000e+00 | \n", "
8 rows × 38 columns
\n", "\n", " | duration | \n", "protocol_type | \n", "service | \n", "flag | \n", "src_bytes | \n", "dst_bytes | \n", "land | \n", "wrong_fragment | \n", "urgent | \n", "hot | \n", "... | \n", "dst_host_srv_count | \n", "dst_host_same_srv_rate | \n", "dst_host_diff_srv_rate | \n", "dst_host_same_src_port_rate | \n", "dst_host_srv_diff_host_rate | \n", "dst_host_serror_rate | \n", "dst_host_srv_serror_rate | \n", "dst_host_rerror_rate | \n", "dst_host_srv_rerror_rate | \n", "label | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "tcp | \n", "http | \n", "SF | \n", "215 | \n", "45076 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "normal. | \n", "
1 | \n", "0 | \n", "tcp | \n", "http | \n", "SF | \n", "162 | \n", "4528 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "1 | \n", "1.0 | \n", "0.0 | \n", "1.00 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "normal. | \n", "
2 | \n", "0 | \n", "tcp | \n", "http | \n", "SF | \n", "236 | \n", "1228 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "2 | \n", "1.0 | \n", "0.0 | \n", "0.50 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "normal. | \n", "
3 | \n", "0 | \n", "tcp | \n", "http | \n", "SF | \n", "233 | \n", "2032 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "3 | \n", "1.0 | \n", "0.0 | \n", "0.33 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "normal. | \n", "
4 | \n", "0 | \n", "tcp | \n", "http | \n", "SF | \n", "239 | \n", "486 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "4 | \n", "1.0 | \n", "0.0 | \n", "0.25 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "normal. | \n", "
5 rows × 42 columns
\n", "