{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd #Dataframe, Series\n", "import numpy as np #Paquetes de Scientific computing, Arrays\n", "from matplotlib import pyplot as plt #Graficos\n", "\n", "from sklearn.model_selection import train_test_split #Dividir Dataset en train y test\n", "from sklearn.preprocessing import LabelEncoder #Pasar datos categoricos a numericos\n", "from sklearn import preprocessing #Normalizacion de datos\n", "\n", "#Imports dibujo arbol de decision\n", "import graphviz\n", "import StringIO as io\n", "import pydotplus\n", "import imageio\n", "\n", "import time #Medir tiempo de entrenamiento\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset de conexiones KDD'99\n", "Dataset: kddcup.data.corrected (Completo)\n", "- Importación del Dataset\n", "- Ingeniería de atributos\n", "- Visualización de datos y gráfico de 3 de los principales atributos\n", "- Entrenamiento de un clasificador (Árbol de decisión)\n", "- Predecir objetivo usando el clasificador entrenado\n", "- Comparación de resultados con regresión lineal" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "IMPORTACIÓN DEL DATASET Y DESCRIPCIÓN" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "col_names = [\"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\n", " \"dst_bytes\",\"land\",\"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\n", " \"logged_in\",\"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\n", " \"num_file_creations\",\"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\n", " \"is_host_login\",\"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\",\n", " \"srv_serror_rate\",\"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\",\n", " \"diff_srv_rate\",\"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\n", " 
\"dst_host_same_srv_rate\",\"dst_host_diff_srv_rate\",\"dst_host_same_src_port_rate\",\n", " \"dst_host_srv_diff_host_rate\",\"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n", " \"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\"label\"]\n", "\n", "data = pd.read_csv('../dataset/kddcup.data.corrected', header=None, names = col_names)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationsrc_bytesdst_byteslandwrong_fragmenturgenthotnum_failed_loginslogged_innum_compromised...dst_host_countdst_host_srv_countdst_host_same_srv_ratedst_host_diff_srv_ratedst_host_same_src_port_ratedst_host_srv_diff_host_ratedst_host_serror_ratedst_host_srv_serror_ratedst_host_rerror_ratedst_host_srv_rerror_rate
count4.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+06...4.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+064.898431e+06
mean4.834243e+011.834621e+031.093623e+035.716116e-066.487792e-047.961733e-061.243766e-023.205108e-051.435290e-018.088304e-03...2.329811e+021.892142e+027.537132e-013.071111e-026.050520e-016.464107e-031.780911e-011.778859e-015.792780e-025.765941e-02
std7.233298e+029.414311e+056.450123e+052.390833e-034.285434e-027.215084e-034.689782e-017.299408e-033.506116e-013.856481e+00...6.402094e+011.059128e+024.111860e-011.085432e-014.809877e-014.125978e-023.818382e-013.821774e-012.309428e-012.309777e-01
min0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
25%0.000000e+004.500000e+010.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...2.550000e+024.900000e+014.100000e-010.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
50%0.000000e+005.200000e+020.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...2.550000e+022.550000e+021.000000e+000.000000e+001.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
75%0.000000e+001.032000e+030.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...2.550000e+022.550000e+021.000000e+004.000000e-021.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
max5.832900e+041.379964e+091.309937e+091.000000e+003.000000e+001.400000e+017.700000e+015.000000e+001.000000e+007.479000e+03...2.550000e+022.550000e+021.000000e+001.000000e+001.000000e+001.000000e+001.000000e+001.000000e+001.000000e+001.000000e+00
\n", "

8 rows × 38 columns

\n", "
" ], "text/plain": [ " duration src_bytes dst_bytes land wrong_fragment \\\n", "count 4.898431e+06 4.898431e+06 4.898431e+06 4.898431e+06 4.898431e+06 \n", "mean 4.834243e+01 1.834621e+03 1.093623e+03 5.716116e-06 6.487792e-04 \n", "std 7.233298e+02 9.414311e+05 6.450123e+05 2.390833e-03 4.285434e-02 \n", "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", "25% 0.000000e+00 4.500000e+01 0.000000e+00 0.000000e+00 0.000000e+00 \n", "50% 0.000000e+00 5.200000e+02 0.000000e+00 0.000000e+00 0.000000e+00 \n", "75% 0.000000e+00 1.032000e+03 0.000000e+00 0.000000e+00 0.000000e+00 \n", "max 5.832900e+04 1.379964e+09 1.309937e+09 1.000000e+00 3.000000e+00 \n", "\n", " urgent hot num_failed_logins logged_in \\\n", "count 4.898431e+06 4.898431e+06 4.898431e+06 4.898431e+06 \n", "mean 7.961733e-06 1.243766e-02 3.205108e-05 1.435290e-01 \n", "std 7.215084e-03 4.689782e-01 7.299408e-03 3.506116e-01 \n", "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", "25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", "50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", "75% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", "max 1.400000e+01 7.700000e+01 5.000000e+00 1.000000e+00 \n", "\n", " num_compromised ... dst_host_count \\\n", "count 4.898431e+06 ... 4.898431e+06 \n", "mean 8.088304e-03 ... 2.329811e+02 \n", "std 3.856481e+00 ... 6.402094e+01 \n", "min 0.000000e+00 ... 0.000000e+00 \n", "25% 0.000000e+00 ... 2.550000e+02 \n", "50% 0.000000e+00 ... 2.550000e+02 \n", "75% 0.000000e+00 ... 2.550000e+02 \n", "max 7.479000e+03 ... 
2.550000e+02 \n", "\n", " dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate \\\n", "count 4.898431e+06 4.898431e+06 4.898431e+06 \n", "mean 1.892142e+02 7.537132e-01 3.071111e-02 \n", "std 1.059128e+02 4.111860e-01 1.085432e-01 \n", "min 0.000000e+00 0.000000e+00 0.000000e+00 \n", "25% 4.900000e+01 4.100000e-01 0.000000e+00 \n", "50% 2.550000e+02 1.000000e+00 0.000000e+00 \n", "75% 2.550000e+02 1.000000e+00 4.000000e-02 \n", "max 2.550000e+02 1.000000e+00 1.000000e+00 \n", "\n", " dst_host_same_src_port_rate dst_host_srv_diff_host_rate \\\n", "count 4.898431e+06 4.898431e+06 \n", "mean 6.050520e-01 6.464107e-03 \n", "std 4.809877e-01 4.125978e-02 \n", "min 0.000000e+00 0.000000e+00 \n", "25% 0.000000e+00 0.000000e+00 \n", "50% 1.000000e+00 0.000000e+00 \n", "75% 1.000000e+00 0.000000e+00 \n", "max 1.000000e+00 1.000000e+00 \n", "\n", " dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate \\\n", "count 4.898431e+06 4.898431e+06 4.898431e+06 \n", "mean 1.780911e-01 1.778859e-01 5.792780e-02 \n", "std 3.818382e-01 3.821774e-01 2.309428e-01 \n", "min 0.000000e+00 0.000000e+00 0.000000e+00 \n", "25% 0.000000e+00 0.000000e+00 0.000000e+00 \n", "50% 0.000000e+00 0.000000e+00 0.000000e+00 \n", "75% 0.000000e+00 0.000000e+00 0.000000e+00 \n", "max 1.000000e+00 1.000000e+00 1.000000e+00 \n", "\n", " dst_host_srv_rerror_rate \n", "count 4.898431e+06 \n", "mean 5.765941e-02 \n", "std 2.309777e-01 \n", "min 0.000000e+00 \n", "25% 0.000000e+00 \n", "50% 0.000000e+00 \n", "75% 0.000000e+00 \n", "max 1.000000e+00 \n", "\n", "[8 rows x 38 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationprotocol_typeserviceflagsrc_bytesdst_byteslandwrong_fragmenturgenthot...dst_host_srv_countdst_host_same_srv_ratedst_host_diff_srv_ratedst_host_same_src_port_ratedst_host_srv_diff_host_ratedst_host_serror_ratedst_host_srv_serror_ratedst_host_rerror_ratedst_host_srv_rerror_ratelabel
00tcphttpSF215450760000...00.00.00.000.00.00.00.00.0normal.
10tcphttpSF16245280000...11.00.01.000.00.00.00.00.0normal.
20tcphttpSF23612280000...21.00.00.500.00.00.00.00.0normal.
30tcphttpSF23320320000...31.00.00.330.00.00.00.00.0normal.
40tcphttpSF2394860000...41.00.00.250.00.00.00.00.0normal.
\n", "

5 rows × 42 columns

\n", "
" ], "text/plain": [ " duration protocol_type service flag src_bytes dst_bytes land \\\n", "0 0 tcp http SF 215 45076 0 \n", "1 0 tcp http SF 162 4528 0 \n", "2 0 tcp http SF 236 1228 0 \n", "3 0 tcp http SF 233 2032 0 \n", "4 0 tcp http SF 239 486 0 \n", "\n", " wrong_fragment urgent hot ... dst_host_srv_count \\\n", "0 0 0 0 ... 0 \n", "1 0 0 0 ... 1 \n", "2 0 0 0 ... 2 \n", "3 0 0 0 ... 3 \n", "4 0 0 0 ... 4 \n", "\n", " dst_host_same_srv_rate dst_host_diff_srv_rate \\\n", "0 0.0 0.0 \n", "1 1.0 0.0 \n", "2 1.0 0.0 \n", "3 1.0 0.0 \n", "4 1.0 0.0 \n", "\n", " dst_host_same_src_port_rate dst_host_srv_diff_host_rate \\\n", "0 0.00 0.0 \n", "1 1.00 0.0 \n", "2 0.50 0.0 \n", "3 0.33 0.0 \n", "4 0.25 0.0 \n", "\n", " dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate \\\n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 \n", "\n", " dst_host_srv_rerror_rate label \n", "0 0.0 normal. \n", "1 0.0 normal. \n", "2 0.0 normal. \n", "3 0.0 normal. \n", "4 0.0 normal. 
\n", "\n", "[5 rows x 42 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4898431 entries, 0 to 4898430\n", "Data columns (total 42 columns):\n", "duration int64\n", "protocol_type object\n", "service object\n", "flag object\n", "src_bytes int64\n", "dst_bytes int64\n", "land int64\n", "wrong_fragment int64\n", "urgent int64\n", "hot int64\n", "num_failed_logins int64\n", "logged_in int64\n", "num_compromised int64\n", "root_shell int64\n", "su_attempted int64\n", "num_root int64\n", "num_file_creations int64\n", "num_shells int64\n", "num_access_files int64\n", "num_outbound_cmds int64\n", "is_host_login int64\n", "is_guest_login int64\n", "count int64\n", "srv_count int64\n", "serror_rate float64\n", "srv_serror_rate float64\n", "rerror_rate float64\n", "srv_rerror_rate float64\n", "same_srv_rate float64\n", "diff_srv_rate float64\n", "srv_diff_host_rate float64\n", "dst_host_count int64\n", "dst_host_srv_count int64\n", "dst_host_same_srv_rate float64\n", "dst_host_diff_srv_rate float64\n", "dst_host_same_src_port_rate float64\n", "dst_host_srv_diff_host_rate float64\n", "dst_host_serror_rate float64\n", "dst_host_srv_serror_rate float64\n", "dst_host_rerror_rate float64\n", "dst_host_srv_rerror_rate float64\n", "label object\n", "dtypes: float64(15), int64(23), object(4)\n", "memory usage: 1.5+ GB\n" ] } ], "source": [ "data.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Histogramas (Normal / Ataque)\n", "- Ejemplo Histogramas (La totalidad se muestra en el Anexo 1 para cada tipo de ataque)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python2.7/dist-packages/matplotlib/cbook/deprecation.py:106: 
MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance. In a future version, a new instance will always be created and returned. Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.\n", " warnings.warn(message, mplDeprecation, stacklevel=1)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Overlay, per attribute, the histogram of normal ('positive') and attack ('negative') connections.\n",
"# Both histograms of an attribute are drawn on ONE axes: calling fig.add_subplot() twice with\n",
"# identical arguments raises MatplotlibDeprecationWarning and, per that warning, will create a\n",
"# second, separate axes instance in future matplotlib versions.\n",
"is_normal = data['label'] == 'normal.'\n",
"\n",
"# (column, plot title, rotate x tick labels?)\n",
"hist_specs = [\n",
"    ('count', \"Count\", False),\n",
"    ('dst_host_same_src_port_rate', \"Destination host same source port rate\", False),\n",
"    ('service', \"Service\", True),  # many service names on the x axis, so tilt them\n",
"]\n",
"\n",
"# One 20x20 inch row per attribute: roughly the panel size of a 10-row grid on a 20x200 figure,\n",
"# but without leaving seven rows empty.\n",
"fig, axes = plt.subplots(len(hist_specs), 1, figsize=(20, 20 * len(hist_specs)))\n",
"\n",
"for ax, (column, title, rotate_ticks) in zip(axes, hist_specs):\n",
"    data.loc[is_normal, column].hist(ax=ax, alpha=0.7, bins=30, label='positive')\n",
"    data.loc[~is_normal, column].hist(ax=ax, alpha=0.7, bins=30, label='negative')\n",
"    ax.set_title(title)\n",
"    ax.legend()  # show which colour is 'positive' and which is 'negative'\n",
"    if rotate_ticks:\n",
"        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Tratamiento de datos antes de entrenar el modelo\n", "- Llamar attack a todo lo que no sea \"normal\" (SUPRIMIDO)\n", "- Transformar atributos categóricos en numéricos usando sklearn.preprocessing import LabelEncoder\n", "- Escalado de los datos entre 0 y 1 (SUPRIMIDO)" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#Llamar attack a los que no sean normales\n", "#data.loc[data['label']!='normal.','label'] = 'attack.'" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [
"# Encode the categorical attributes as integers with sklearn's LabelEncoder.\n",
"# One encoder is kept per column so that every mapping can still be inverted afterwards,\n",
"# e.g. encoders['service'].inverse_transform(codes); a single encoder re-fitted on each\n",
"# column only remembers the classes of the last column it saw.\n",
"categorical_columns = ['protocol_type', 'service', 'flag', 'label']\n",
"\n",
"# This cell overwrites `data` in place. A second run would turn the integer codes into the\n",
"# strings '0', '1', ..., '10', ... and re-encode them in lexicographic order, silently\n",
"# permuting the codes, so refuse to run on columns that are already numeric.\n",
"already_encoded = [col for col in categorical_columns\n",
"                   if pd.api.types.is_numeric_dtype(data[col])]\n",
"if already_encoded:\n",
"    raise RuntimeError(\"Columns already encoded: {}. Reload the dataset before re-running this cell.\"\n",
"                       .format(already_encoded))\n",
"\n",
"data_labels = data.label  # reference to the label column taken before it is encoded\n",
"\n",
"encoders = {}\n",
"for col in categorical_columns:\n",
"    encoders[col] = LabelEncoder()\n",
"    data[col] = encoders[col].fit_transform(data[col].astype('str'))\n",
"\n",
"# Kept for backward compatibility: `number` ends up fitted on the label column, as before.\n",
"number = encoders['label']" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#ESCALADO usando: from sklearn import preprocessing\n", "#minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))\n", "#data_minmax = minmax_scaler.fit_transform(data)\n", "#data_minmax = pd.DataFrame(data_minmax, columns=col_names)" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "y = data.label\n", "#X = data_minmax.drop('label', axis=1)\n", "X = data.drop('label', axis=1)\n", "X = X.drop('is_host_login', axis=1)\n", "X = X.drop('num_outbound_cmds', axis=1)\n", "X = X.drop('urgent', axis=1)\n", "X = X.drop('su_attempted', axis=1)\n", "X = X.drop('num_shells', axis=1)\n", "X = X.drop('land', axis=1)\n", "X = 
X.drop('root_shell', axis=1)\n", "X = X.drop('num_failed_logins', axis=1)\n", "X = X.drop('num_file_creations', axis=1)\n", "X = X.drop('num_root', axis=1)\n", "#X = X.drop('is_guest_login', axis=1)\n", "#X = X.drop('num_access_files', axis=1)" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": false }, "outputs": [], "source": [
"# Fixed seed so the 85/15 split, and every score derived from it, is reproducible on re-run.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training size: 4163666; Test size: 734765\n" ] }, { "data": { "text/plain": [ "(4898431, 31)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Training size: {}; Test size: {}\".format(len(X_train),len(X_test)))\n", "X.shape" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4163666, 31)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(734765, 31)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test.shape" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Arbol de decision" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
"from sklearn import tree # decision trees\n",
"from sklearn.tree import DecisionTreeClassifier, export_graphviz\n",
"\n",
"# min_samples_split bounds the complexity of the tree: with 2 it would grow as complex as\n",
"# possible (overfitted). random_state is fixed so that the fitted tree is reproducible.\n",
"c = DecisionTreeClassifier(min_samples_split=10, random_state=42)" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
"# Feature names for the tree plot, taken from the feature matrix itself so they always match\n",
"# the columns (and their order) the classifier is trained on, whichever columns were dropped.\n",
"features = list(X.columns)" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "23:31:05\n", "23:31:49\n" ] } ], "source": [
"# Print the wall-clock time before and after fitting to show how long training takes.\n",
"print(time.strftime(\"%H:%M:%S\"))\n",
"dt = c.fit(X_train, y_train)\n",
"print(time.strftime(\"%H:%M:%S\"))" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
"def show_tree(tree, features, path):\n",
"    \"\"\"Render a fitted decision tree to a PNG file and display that image inline.\n",
"\n",
"    tree     -- fitted estimator, passed to sklearn's export_graphviz\n",
"    features -- feature names, passed to export_graphviz as feature_names\n",
"    path     -- where the PNG is written; the file is then read back for display\n",
"    \"\"\"\n",
"    # out_file=None makes export_graphviz return the DOT source as a string, so no\n",
"    # StringIO buffer (a module that differs between Python 2 and 3) is needed.\n",
"    dot_data = export_graphviz(tree, out_file=None, feature_names=features)\n",
"    pydotplus.graph_from_dot_data(dot_data).write_png(path)\n",
"    img = imageio.imread(path)\n",
"    # Size only this figure instead of overwriting the global rcParams default.\n",
"    plt.figure(figsize=(20, 20))\n",
"    plt.imshow(img)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "show_tree(dt, features, 'dec_tree_01.png')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Predictions on the held-out test part of the dataset\n",
"y_pred = c.predict(X_test)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"from sklearn.metrics import accuracy_score\n",
"score = accuracy_score(y_test, y_pred) * 100  # accuracy as a percentage\n",
"print(\"Accuracy using Decision Tree: {}\".format(score))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": 
"ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 2 }