Primero, las librerías
# Para cada librería preguntamos primero si está instalada
if(!is.element("dplyr", installed.packages()[, 1]))
install.packages("dplyr", repos = 'http://cran.us.r-project.org')
library(dplyr)
if(!is.element("ggplot2", installed.packages()[, 1]))
install.packages("ggplot2", repos = 'http://cran.us.r-project.org')
library(ggplot2)
if(!is.element("lubridate", installed.packages()[, 1]))
install.packages("lubridate", repos = 'http://cran.us.r-project.org')
library(lubridate)
if(!is.element("chron", installed.packages()[, 1]))
install.packages("chron", repos = 'http://cran.us.r-project.org')
library(chron)
if(!is.element("pROC", installed.packages()[, 1]))
install.packages("pROC", repos = 'http://cran.us.r-project.org')
library(pROC)
La de H2O la tratamos aparte. Si hay que instalarla o actualizarla, es mejor descargarla desde su cuenta de AWS
# http://h2o-release.s3.amazonaws.com/h2o/rel-wolpert/2/index.html
# The following two commands remove any previously installed H2O packages for R.
if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) }
if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }
# Next, we download packages that H2O depends on.
pkgs <- c("RCurl","jsonlite")
for (pkg in pkgs) {
if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) }
}
# Now we download, install and initialize the H2O package for R.
install.packages("h2o", type="source", repos="http://h2o-release.s3.amazonaws.com/h2o/rel-wolpert/2/R")
Ahora ya podemos cargar el paquete
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following object is masked from 'package:pROC':
##
## var
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## ||, &&, %*%, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, %in%, is.character, is.factor, is.numeric,
## log, log10, log1p, log2, round, signif, trunc
Obtenemos los datos de la web: https://data.lacity.org/A-Safe-City/Crime-Data-from-2010-to-Present/y8tr-7khq/data en la pestaña export y en download as elegimos el formato csv
Cargamos los datos (acordándonos de modificar la ruta)
# Datos descargados el 15-02-2018
crime_data <- read.csv('/media/enero/Disco1/Varios/R/Archivos/clases_r/h2o/data/Crime_Data_from_2010_to_Present-15-02-2018.csv')
Un summary()
summary(crime_data)
## DR.Number Date.Reported Date.Occurred
## Min. : 210 08/07/2017: 805 01/01/2010: 2155
## 1st Qu.:120315238 12/18/2017: 794 01/01/2011: 2069
## Median :140620602 07/24/2017: 783 01/01/2012: 1664
## Mean :138030952 10/23/2017: 771 01/01/2013: 1489
## 3rd Qu.:160624731 11/28/2017: 762 01/01/2014: 1353
## Max. :910220366 01/29/2018: 761 01/01/2015: 1301
## (Other) :1678633 (Other) :1673278
## Time.Occurred Area.ID Area.Name Reporting.District
## Min. : 1 Min. : 1.00 77th Street: 117154 Min. : 100
## 1st Qu.: 930 1st Qu.: 6.00 Southwest : 108517 1st Qu.: 645
## Median :1430 Median :12.00 N Hollywood: 91554 Median :1204
## Mean :1364 Mean :11.14 Pacific : 88964 Mean :1161
## 3rd Qu.:1900 3rd Qu.:16.00 Southeast : 88578 3rd Qu.:1675
## Max. :2359 Max. :21.00 Mission : 84802 Max. :2198
## (Other) :1103740
## Crime.Code Crime.Code.Description
## Min. :110.0 BATTERY - SIMPLE ASSAULT :154114
## 1st Qu.:330.0 BURGLARY FROM VEHICLE :129308
## Median :440.0 VEHICLE - STOLEN :129271
## Mean :506.8 BURGLARY :121439
## 3rd Qu.:626.0 THEFT PLAIN - PETTY ($950 & UNDER):120106
## Max. :956.0 THEFT OF IDENTITY :105852
## (Other) :923219
## MO.Codes Victim.Age Victim.Sex Victim.Descent
## : 182688 Min. :10.00 :155068 H :582840
## 0344 : 182670 1st Qu.:23.00 -: 1 W :414424
## 0329 : 72917 Median :34.00 F:715645 B :269782
## 1501 : 36281 Mean :35.97 H: 56 O :162664
## 0416 : 25262 3rd Qu.:48.00 M:784098 :155104
## 0325 : 22698 Max. :99.00 X: 28441 X : 47013
## (Other):1160793 NA's :135446 (Other): 51482
## Premise.Code Premise.Description
## Min. :101 STREET :375205
## 1st Qu.:102 SINGLE FAMILY DWELLING :346791
## Median :210 MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC):216227
## Mean :312 PARKING LOT :119893
## 3rd Qu.:501 SIDEWALK : 84044
## Max. :971 OTHER BUSINESS : 75323
## NA's :97 (Other) :465826
## Weapon.Used.Code
## Min. :101.0
## 1st Qu.:400.0
## Median :400.0
## Mean :370.6
## 3rd Qu.:400.0
## Max. :516.0
## NA's :1125843
## Weapon.Description
## :1125844
## STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE): 339404
## VERBAL THREAT : 46246
## UNKNOWN WEAPON/OTHER WEAPON : 43517
## HAND GUN : 26849
## SEMI-AUTOMATIC PISTOL : 10620
## (Other) : 90829
## Status.Code Status.Description Crime.Code.1
## IC :1302599 Adult Arrest: 172666 Min. :110.0
## AO : 190613 Adult Other : 190613 1st Qu.:330.0
## AA : 172666 Invest Cont :1302599 Median :440.0
## JA : 13246 Juv Arrest : 13246 Mean :506.7
## JO : 4154 Juv Other : 4154 3rd Qu.:626.0
## CC : 26 UNK : 31 Max. :999.0
## (Other): 5 NA's :6
## Crime.Code.2 Crime.Code.3 Crime.Code.4
## Min. :210.0 Min. : 93.0 Min. :421.0
## 1st Qu.:998.0 1st Qu.:998.0 1st Qu.:998.0
## Median :998.0 Median :998.0 Median :998.0
## Mean :953.3 Mean :970.3 Mean :969.9
## 3rd Qu.:998.0 3rd Qu.:998.0 3rd Qu.:998.0
## Max. :999.0 Max. :999.0 Max. :999.0
## NA's :1576655 NA's :1680910 NA's :1683231
## Address
## 6TH ST : 3897
## 7TH ST : 3014
## 9300 TAMPA AV: 2873
## 5TH ST : 2415
## 6TH : 2409
## 6600 TOPANGA CANYON BL: 2340
## (Other) :1666361
## Cross.Street Location
## :1403492 (0, 0) : 6416
## BROADWAY : 4923 (34.1016, -118.3387): 3192
## FIGUEROA : 2969 (34.1905, -118.6059): 2316
## VERMONT AV: 2953 (33.9892, -118.3089): 2001
## SAN PEDRO : 2821 (34.1576, -118.438) : 1797
## WESTERN AV: 2821 (34.2216, -118.4488): 1684
## (Other) : 263330 (Other) :1665903
Y un str()
str(crime_data)
## 'data.frame': 1683309 obs. of 26 variables:
## $ DR.Number : int 1208575 102005556 418 101822289 42104479 120125367 101105609 101620051 101910498 120908292 ...
## $ Date.Reported : Factor w/ 2963 levels "01/01/2010","01/01/2011",..: 623 217 663 2556 95 67 244 2556 812 743 ...
## $ Date.Occurred : Factor w/ 2963 levels "01/01/2010","01/01/2011",..: 599 190 655 2548 32 67 235 2524 812 130 ...
## $ Time.Occurred : int 1800 2300 2030 1800 2300 1400 2230 1600 1600 800 ...
## $ Area.ID : int 12 20 18 18 21 1 11 16 19 9 ...
## $ Area.Name : Factor w/ 21 levels "77th Street",..: 1 12 15 15 17 2 11 4 8 18 ...
## $ Reporting.District : int 1241 2071 1823 1803 2133 111 1125 1641 1902 904 ...
## $ Crime.Code : int 626 510 510 510 745 110 510 510 510 668 ...
## $ Crime.Code.Description: Factor w/ 143 levels "","ABORTION/ILLEGAL",..: 76 139 139 139 137 40 139 139 139 59 ...
## $ MO.Codes : Factor w/ 370891 levels "","0100","0100 0104",..: 168286 1 1 1 48103 264208 1 1 1 107446 ...
## $ Victim.Age : int 30 NA 12 NA 84 49 NA NA NA 27 ...
## $ Victim.Sex : Factor w/ 6 levels "","-","F","H",..: 3 1 1 1 5 3 1 1 1 3 ...
## $ Victim.Descent : Factor w/ 21 levels "","-","A","B",..: 19 1 1 1 19 19 1 1 1 14 ...
## $ Premise.Code : int 502 101 101 101 501 501 108 101 101 203 ...
## $ Premise.Description : Factor w/ 304 levels "","7TH AND METRO CENTER (NOT LINE SPECIFIC)",..: 193 261 261 261 250 250 214 261 261 206 ...
## $ Weapon.Used.Code : int 400 NA NA NA NA 400 NA NA NA NA ...
## $ Weapon.Description : Factor w/ 80 levels "","AIR PISTOL/REVOLVER/RIFLE/BB GUN",..: 67 1 1 1 1 67 1 1 1 1 ...
## $ Status.Code : Factor w/ 10 levels "","13","19","AA",..: 5 7 7 7 7 4 7 7 7 7 ...
## $ Status.Description : Factor w/ 6 levels "Adult Arrest",..: 2 3 3 3 3 1 3 3 3 3 ...
## $ Crime.Code.1 : int 626 510 510 510 745 110 510 510 510 668 ...
## $ Crime.Code.2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Crime.Code.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Crime.Code.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Address : Factor w/ 71796 levels "00","00000 20TH AV",..: 49292 70835 25051 58794 53019 48048 71755 64955 65708 53176 ...
## $ Cross.Street : Factor w/ 11487 levels "","100TH ST",..: 1 139 1 10697 1 1 1121 10158 3362 1 ...
## $ Location : Factor w/ 61121 levels "","(0, 0)","(33.3427, -118.3258)",..: 9181 20839 4882 6203 45655 24053 34484 52498 60520 45810 ...
Es un dataset grande, de mas de 1,5 millones de observaciones
Vemos los nombres
names(crime_data)
## [1] "DR.Number" "Date.Reported"
## [3] "Date.Occurred" "Time.Occurred"
## [5] "Area.ID" "Area.Name"
## [7] "Reporting.District" "Crime.Code"
## [9] "Crime.Code.Description" "MO.Codes"
## [11] "Victim.Age" "Victim.Sex"
## [13] "Victim.Descent" "Premise.Code"
## [15] "Premise.Description" "Weapon.Used.Code"
## [17] "Weapon.Description" "Status.Code"
## [19] "Status.Description" "Crime.Code.1"
## [21] "Crime.Code.2" "Crime.Code.3"
## [23] "Crime.Code.4" "Address"
## [25] "Cross.Street" "Location"
Vamos a cambiarlas
# Minusculas
names(crime_data) <- tolower(names(crime_data))
# cambiar . por _
names(crime_data) <- gsub('.', '_', names(crime_data), fixed = TRUE)
# Las vemos
names(crime_data)
## [1] "dr_number" "date_reported"
## [3] "date_occurred" "time_occurred"
## [5] "area_id" "area_name"
## [7] "reporting_district" "crime_code"
## [9] "crime_code_description" "mo_codes"
## [11] "victim_age" "victim_sex"
## [13] "victim_descent" "premise_code"
## [15] "premise_description" "weapon_used_code"
## [17] "weapon_description" "status_code"
## [19] "status_description" "crime_code_1"
## [21] "crime_code_2" "crime_code_3"
## [23] "crime_code_4" "address"
## [25] "cross_street" "location"
Algunos cambios en valores para pasarlos a NA
# Los pasamos a caracter
crime_data$mo_codes <- as.character(crime_data$mo_codes)
# Cambiamos los valores en blanco por NA
crime_data$mo_codes[crime_data$mo_codes==''] <- NA
# table(crime_data$mo_code, useNA = 'ifany') # Ocupa mucho
Creamos un objeto con los valores únicos
# Los valores únicos de crime_data$mo_codes
mo_codes <- unique(as.character(crime_data$mo_codes))
# Quitamos los NAs
mo_codes <- mo_codes[!is.na(mo_codes)]
# Sacamos los valores únicos
mo_codes_unique <- unique(paste(mo_codes, collapse = ' '))
# Separamos los valores con ' '
mo_codes <- unique(strsplit(mo_codes_unique, ' '))
# Pasamos a character
mo_codes <- unlist(mo_codes)
# Nos quedamos con los valores únicos
mo_codes <- unique(mo_codes)
# Los ordenamos
mo_codes <- mo_codes[order(mo_codes)]
# Cuantos hay
length(mo_codes)
## [1] 674
Aquí podríamos obtener un one-hot-encoding de los mo_codes. No lo hacemos porque estamos evaluando la eficacia de autoML, pero es uno de los temas a tratar para mejorar el modelado
# Los pasamos a caracter
crime_data$status_code <- as.character(crime_data$status_code)
# Cambiamos los valores en blanco por NA
crime_data$status_code[crime_data$status_code==''] <- NA
# Volvemos a pasar a factor
crime_data$status_code <- as.factor(crime_data$status_code)
# Sacamos una tabla
table(crime_data$status_code, useNA = 'ifany')
##
## 13 19 AA AO CC IC JA JO TH
## 1 1 172666 190613 26 1302599 13246 4154 1
## <NA>
## 2
# Pasamos a caracter
crime_data$cross_street <- as.character(crime_data$cross_street)
# Cambiamos los valores en blanco por NA
crime_data$cross_street[crime_data$cross_street %in% c('')] <- NA
# table(crime_data$cross_street, useNA = 'ifany') # Muy largo
# Volvemos a pasar a factor
crime_data$cross_street <- as.factor(crime_data$cross_street)
# Pasamos a caracter
crime_data$weapon_description <- as.character(crime_data$weapon_description)
# Cambiamos los valores en blanco por NA
crime_data$weapon_description[crime_data$weapon_description %in% c('')] <- NA
# table(crime_data$weapon_description, useNA = 'ifany') # Muy largo
# Volvemos a pasar a factor
crime_data$weapon_description <- as.factor(crime_data$weapon_description)
crime_data$crime_code_description <- as.character(crime_data$crime_code_description)
crime_data$crime_code_description[crime_data$crime_code_description %in% c('')] <- NA
# table(crime_data$crime_code_description, useNA = 'ifany')
# Volvemos a pasar a factor
crime_data$crime_code_description <- as.factor(crime_data$cross_street)
# Pasamos a caracter
crime_data$premise_description <- as.character(crime_data$premise_description)
# Cambiamos los valores en blanco por NA
crime_data$premise_description[crime_data$premise_description %in% c('')] <- NA
# table(crime_data$premise_description, useNA = 'ifany')
# Volvemos a pasar a factor
crime_data$premise_description <- as.factor(crime_data$premise_description)
# Pasamos a caracter
crime_data$address <- as.character(crime_data$address)
# Cambiamos los valores '00' por NA
crime_data$address[crime_data$address=='00'] <- NA
# table(crime_data$address, useNA = 'ifany')
# Pasamos a factor
crime_data$address <- as.factor(crime_data$address)
# Pasamos a caracter
crime_data$victim_sex <- as.character(crime_data$victim_sex)
# Cambiamos los valores en blanco y los '-' por NA
crime_data$victim_sex[crime_data$victim_sex %in% c('', '-')] <- NA
# Una tabla
table(crime_data$victim_sex, useNA = 'ifany')
##
## F H M X <NA>
## 715645 56 784098 28441 155069
# Volvemos a pasar a factor
crime_data$victim_sex <- as.factor(crime_data$victim_sex)
A - Other Asian B - Black C - Chinese D - Cambodian F - Filipino G - Guamanian H - Hispanic/Latin/Mexican I - American Indian/Alaskan Native J - Japanese K - Korean L - Laotian O - Other P - Pacific Islander S - Samoan U - Hawaiian V - Vietnamese W - White X - Unknown Z - Asian Indian
# Renombramos los niveles
levels(crime_data$victim_descent) <- c("", "-", " Other_Asian", "Black",
"Chinese", "Cambodian", "Filipino",
"Guamanian", "Hispanic-Latin-Mexican",
"American_Indian-Alaskan_Native",
"Japanese", "Korean", "Laotian", "Other",
"Pacific_Islander", "Samoan",
"Hawaiian", "Vietnamese", "White",
"Unknown", "Asian_Indian")
# Pasamos a caracter
crime_data$victim_descent <- as.character(crime_data$victim_descent)
# Cambiamos los valores en blanco y los '-' por 'Unknown'
crime_data$victim_descent[crime_data$victim_descent %in% c('', '-')] <- 'Unknown'
# Una tabla
table(crime_data$victim_descent, useNA = 'ifany')
##
## American_Indian-Alaskan_Native Asian_Indian
## 711 58
## Black Cambodian
## 269782 15
## Chinese Filipino
## 645 1981
## Guamanian Hawaiian
## 69 156
## Hispanic-Latin-Mexican Japanese
## 582840 248
## Korean Laotian
## 7531 10
## Other Other_Asian
## 162664 39648
## Pacific_Islander Samoan
## 292 26
## Unknown Vietnamese
## 202118 91
## White
## 414424
# Y volvemos a factor
crime_data$victim_descent <- as.factor(crime_data$victim_descent)
# pasamos a caracter
crime_data$location <- as.character(crime_data$location)
# Los que tienen location '' los pasamos a '(0, 0)'
crime_data$location[crime_data$location==""] <- "(0, 0)"
# table(crime_data$location, useNA = 'ifany')
Creamos un objeto (crime_data_location) y limpiamos crime_data_location, quitando ‘’, ‘(’ y ‘)’
# Copiamos de crime_data$location
crime_data_location <- crime_data$location
# Quitamos los '('
crime_data_location <- gsub('(', '', crime_data_location, fixed = TRUE)
# Quitamos los ')'
crime_data_location <- gsub(')', '', crime_data_location, fixed = TRUE)
# Quitamos los ' '
crime_data_location <- gsub(' ', '', crime_data_location, fixed = TRUE)
temp2 <- strsplit(crime_data_location, ',')
# temp3 <- as.data.frame(temp2)
temp4 <- unlist(temp2)
temp5 <- stringr::str_split(crime_data_location, ',')
# temp6 <- as.data.frame(temp5, optional = TRUE)
# https://stackoverflow.com/questions/4227223/r-list-to-data-frame
temp7 <- do.call(rbind.data.frame, temp2)
# temp8 <- do.call(rbind, lapply(temp2, data.frame, stringsAsFactors=FALSE))
# temp9 <- do.call(rbind, lapply(temp2, matrix, stringsAsFactors=FALSE))
temp10 <- data.frame(matrix(unlist(temp2), nrow=1674208, byrow=T),
stringsAsFactors=FALSE)
Vamos a incluir en crime_data las variables location_x y location_y
# https://stackoverflow.com/questions/4227223/r-list-to-data-frame
crime_data_location_df <- data.frame(matrix(unlist(strsplit(crime_data_location, ',')),
nrow=length(crime_data_location),
byrow=T), stringsAsFactors=FALSE)
# Cambiamos los nombres
names(crime_data_location_df) <- c('location_x', 'location_y')
# Pasamos a numérico
crime_data$location_x <- as.numeric(crime_data_location_df$location_x)
crime_data$location_y <- as.numeric(crime_data_location_df$location_y)
# Cambiamos los 0 por NAs
crime_data$location_x[crime_data$location_x==0] <- NA
crime_data$location_y[crime_data$location_y==0] <- NA
# Cambiamos también los (0, 0) por NAs
crime_data$location[crime_data$location=="(0, 0)"] <- NA
# Y pasamos location a factor (ya tenemos un location_x y location_y numericos)
crime_data$location <- as.factor(crime_data$location)
Limpiamos el entorno
# Borramos dos datasets
rm(crime_data_location_df, crime_data_location)
# Limpiamos y vemos la memoria
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1314514 70.3 3886542 207.6 3409184 182.1
## Vcells 31284553 238.7 93917411 716.6 93746544 715.3
Creamos un contador de NAs para cada columna
# Creamos la nueva variable con un count
na_count <- do.call(rbind, lapply(colnames(crime_data), function(x) {return(data.frame(x, sum(is.na(crime_data[[x]]))))}))
# Cambiamos el nombre de las variables en el dataset
colnames(na_count) <- c('attribute_name', 'na_count')
# na_count$attribute_name <- as.character(na_count$attribute_name)
# Nos quedamos con los distintos de cero
na_count <- na_count[na_count$na_count != 0,]
# Lo vemos en gráfico
plot(na_count)
# Creamos una variable con el mes del crimen
crime_data$crime_month <- as.factor(substr(as.character(crime_data$date_occurred), 1, 2))
# Vemos en tabla
table(crime_data$crime_month, useNA = 'ifany')
##
## 01 02 03 04 05 06 07 08 09 10
## 160757 127346 138530 135394 141384 139076 143685 144028 138439 143885
## 11 12
## 132921 137864
Cambiamos los valores tipo fecha a formato as.Date
crime_data$date_reported <- as.Date(as.character(crime_data$date_reported), '%m/%d/%Y')
crime_data$date_occurred <- as.Date(as.character(crime_data$date_occurred), '%m/%d/%Y')
Ordenamos por fecha
crime_data <- crime_data[order(crime_data$date_occurred), ]
Generamos algunas columnas
Año
# Con lubridate
# crime_data$crime_year <- year(crime_data$date_occurred)
crime_data$crime_year <- as.factor(substr(as.character(crime_data$date_occurred), 1, 4))
table(crime_data$crime_year, useNA = 'ifany')
##
## 2010 2011 2012 2013 2014 2015 2016 2017 2018
## 208631 200251 200888 191887 194632 214212 223690 227931 21187
Mes
# Mes
# Con lubridate
# crime_data$crime_month <- month(crime_data$date_occurred, label = TRUE)
crime_data$crime_month <- as.factor(substr(as.character(crime_data$date_occurred), 6, 7))
table(crime_data$crime_month, useNA = 'ifany')
##
## 01 02 03 04 05 06 07 08 09 10
## 160757 127346 138530 135394 141384 139076 143685 144028 138439 143885
## 11 12
## 132921 137864
Semana
crime_data$crime_week <- week(crime_data$date_occurred)
table(crime_data$crime_week, useNA = 'ifany')
Convertimos el tiempo con el paquete chron
crime_data$crime_time <- times(substr(as.POSIXct(sprintf("%04.0f", crime_data$time_occurred), origin = '1970-01-01', "Asia/Calcutta", format='%H%M'), 12, 19))
Creamos una variable minuto
crime_data$crime_minute <- (crime_data$time_occurred %/% 100)*60 + (crime_data$time_occurred %% 100)*100
table(crime_data$crime_minute, useNA = 'ifany')
##
## 60 100 120 160 180 200 220 240 260 280 300 320
## 20680 29008 17806 16 13081 77 16 9022 29 22 7879 19
## 340 360 380 400 420 440 460 480 500 520 540 560
## 9 12866 26 52 17138 28 34 41277 8995 45 31455 800
## 580 600 620 640 660 680 700 720 740 760 780 800
## 37 34706 789 40 27988 592 86 93519 487 70 30837 432
## 820 840 860 880 900 920 940 960 980 1000 1020 1040
## 155 34841 421 68 39481 550 74 39456 749 3365 44112 795
## 1060 1080 1100 1120 1140 1160 1180 1200 1220 1240 1260 1280
## 1836 50136 983 1577 44675 1100 1205 46920 1355 961 43079 1211
## 1300 1320 1340 1360 1380 1400 1420 1440 1460 1480 1500 1520
## 873 44105 1343 812 34900 1369 1286 122 1346 1639 3755 1305
## 1540 1560 1580 1600 1620 1640 1660 1680 1700 1720 1740 1760
## 1633 2590 1359 1706 2436 1302 2049 1828 1456 2450 1379 1345
## 1780 1800 1820 1840 1860 1880 1900 1920 1940 1960 1980 2000
## 2557 1319 1252 2675 1536 1014 2802 2572 139 2810 3074 2394
## 2020 2040 2060 2080 2100 2120 2140 2160 2180 2200 2220 2240
## 2657 3271 2358 2655 3799 2076 2670 3779 1602 3099 4302 1318
## 2260 2280 2300 2320 2340 2360 2380 2400 2420 2440 2460 2480
## 2626 4034 1194 2564 4341 1179 2173 4842 1900 170 4283 2247
## 2500 2520 2540 2560 2580 2600 2620 2640 2660 2680 2700 2720
## 1285 4671 2232 1100 4623 2868 925 4698 3009 746 4769 3112
## 2740 2760 2780 2800 2820 2840 2860 2880 2900 2920 2940 2960
## 616 4409 3405 573 4113 3826 574 3560 3761 757 166 3570
## 2980 3000 3020 3040 3060 3080 3100 3120 3140 3160 3180 3200
## 909 11729 3597 1000 9782 3738 1238 7778 3844 1308 5242 3628
## 3220 3240 3260 3280 3300 3320 3340 3360 3380 3400 3420 3440
## 1505 4431 3558 1517 4581 3071 1614 6409 2771 1604 10260 163
## 3460 3480 3500 3520 3540 3560 3580 3600 3620 3640 3660 3680
## 1513 13182 1069 1707 12650 976 1554 14451 863 1607 14138 694
## 3700 3720 3740 3760 3780 3800 3820 3840 3860 3880 3900 3920
## 1659 14330 522 1497 14500 446 1374 16570 483 1161 18353 777
## 3940 3960 3980 4000 4020 4040 4060 4080 4100 4120 4140 4160
## 112 18389 837 1950 19603 992 2084 20945 1155 1739 20827 1251
## 4180 4200 4220 4240 4260 4280 4300 4320 4340 4360 4380 4400
## 1377 20649 1364 938 20509 1444 1066 19558 1425 1259 16228 1608
## 4420 4440 4460 4480 4500 4520 4540 4560 4580 4600 4620 4640
## 1871 130 1470 1926 2909 1539 2191 2805 1443 2534 2317 1569
## 4660 4680 4700 4720 4740 4760 4780 4800 4820 4840 4860 4880
## 2924 1815 1602 2933 1517 1405 3117 1598 1275 3258 2171 1233
## 4900 4920 4940 4960 4980 5000 5020 5040 5060 5080 5100 5120
## 3278 3354 138 3338 3283 1952 3303 3678 2230 3333 3983 1723
## 5140 5160 5180 5200 5220 5240 5260 5280 5300 5320 5340 5360
## 3391 4639 1347 3381 4672 1107 3419 4556 1070 2951 4699 1396
## 5380 5400 5420 5440 5460 5480 5500 5520 5540 5560 5580 5600
## 2795 5067 2073 125 5271 2053 932 5346 2369 931 5408 2518
## 5620 5640 5660 5680 5700 5720 5740 5760 5780 5800 5820 5840
## 832 5426 2912 625 5162 3014 500 4983 3350 517 4047 3311
## 5860 5880 5900 5920 5940 5960 5980 6000 6020 6040 6060 6080
## 592 4708 3290 824 130 3167 869 133 3470 970 135 3369
## 6100 6120 6140 6160 6180 6200 6220 6240 6260 6280 6300 6320
## 1110 147 3356 1426 149 3129 1297 103 3079 1301 129 2927
## 6340 6360 6380 6400 6420 6440 6460 6480 6500 6520 6540 6560
## 1397 104 3609 1280 110 116 1257 105 115 1480 84 151
## 6580 6600 6620 6640 6660 6680 6700 6720 6740 6760 6780 6800
## 1390 87 99 1361 78 118 1361 83 72 1214 69 79
## 6820 6840 6860 6880 6900 6920 6940 6960 6980 7000 7020 7040
## 1082 43 76 2666 28 69 51 26 62 38 29 26
## 7060 7080 7100 7120 7160 7180 7220 7280
## 40 32 34 51 27 41 29 1121
Establecemos algunas franjas horarias
crime_data$crime_time_interval <- cut(crime_data$crime_time, breaks = times(c('00:00:00', '05:00:00', '09:00:00', '13:00:00', '17:00:00', '21:00:00', '23:59:00')), labels = c('Late_night', 'Early_morning', 'Late_morning', 'Early_afternoon', 'Late_afternoon', 'Evening'), include.lowest = TRUE)
table(crime_data$crime_time_interval, useNA = 'ifany')
##
## Late_night Early_morning Late_morning Early_afternoon
## 218525 186980 336565 340978
## Late_afternoon Evening
## 385880 214381
Eliminamos la variable crime_time, ya que es equivalente a crime_minute
# crime_data$crime_time <- as.factor(crime_data$crime_time)
crime_data$crime_time <- NULL
Día de la semana
crime_data$occurred_week_day <- factor(weekdays(crime_data$date_occurred),
levels = c("lunes", "martes",
"miércoles", "jueves",
"viernes", "sábado",
"domingo"))
table(crime_data$occurred_week_day, useNA = 'ifany')
##
## lunes martes miércoles jueves viernes sábado domingo
## 239881 235817 238315 236632 259729 242043 230892
Tiempo de retraso a la hora de denunciar
crime_data$delay_reported <- crime_data$date_reported - crime_data$date_occurred
crime_data$delay_reported <- as.numeric(crime_data$delay_reported)
# En esta parte se crean nuevas variables a partir de la suma y de la media
# No se ejecuta al tener eval=FALSE
week_day_occurred_sum <- aggregate(date_occurred ~ occurred_week_day,
data = crime_data, "sum")
week_day_occurred_mean <- aggregate(date_occurred ~ occured_week_day,
data = crime_data, "mean")
all_sum <- aggregate(target ~ date_occurred + occurred_week_day,
data = train, "sum")
all_mean <- aggregate(target ~ date_occurred + occured_week_day,
data = train, "mean")
plot(all_mean$date_occurred, all_mean$target)
Otro summary()
summary(crime_data)
## dr_number date_reported date_occurred
## Min. : 210 Min. :2010-01-01 Min. :2010-01-01
## 1st Qu.:120315238 1st Qu.:2012-02-11 1st Qu.:2012-01-21
## Median :140620602 Median :2014-04-13 Median :2014-03-21
## Mean :138030952 Mean :2014-03-08 Mean :2014-02-19
## 3rd Qu.:160624731 3rd Qu.:2016-04-13 3rd Qu.:2016-03-28
## Max. :910220366 Max. :2018-02-10 Max. :2018-02-10
##
## time_occurred area_id area_name reporting_district
## Min. : 1 Min. : 1.00 77th Street: 117154 Min. : 100
## 1st Qu.: 930 1st Qu.: 6.00 Southwest : 108517 1st Qu.: 645
## Median :1430 Median :12.00 N Hollywood: 91554 Median :1204
## Mean :1364 Mean :11.14 Pacific : 88964 Mean :1161
## 3rd Qu.:1900 3rd Qu.:16.00 Southeast : 88578 3rd Qu.:1675
## Max. :2359 Max. :21.00 Mission : 84802 Max. :2198
## (Other) :1103740
## crime_code crime_code_description
## Min. :110.0 BROADWAY : 4923
## 1st Qu.:330.0 FIGUEROA : 2969
## Median :440.0 VERMONT AV: 2953
## Mean :506.8 SAN PEDRO : 2821
## 3rd Qu.:626.0 WESTERN AV: 2821
## Max. :956.0 (Other) : 263330
## NA's :1403492
## mo_codes victim_age victim_sex
## Length:1683309 Min. :10.00 F :715645
## Class :character 1st Qu.:23.00 H : 56
## Mode :character Median :34.00 M :784098
## Mean :35.97 X : 28441
## 3rd Qu.:48.00 NA's:155069
## Max. :99.00
## NA's :135446
## victim_descent premise_code
## Hispanic-Latin-Mexican:582840 Min. :101
## White :414424 1st Qu.:102
## Black :269782 Median :210
## Unknown :202118 Mean :312
## Other :162664 3rd Qu.:501
## Other_Asian : 39648 Max. :971
## (Other) : 11833 NA's :97
## premise_description weapon_used_code
## STREET :375205 Min. :101.0
## SINGLE FAMILY DWELLING :346791 1st Qu.:400.0
## MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC):216227 Median :400.0
## PARKING LOT :119893 Mean :370.6
## SIDEWALK : 84044 3rd Qu.:400.0
## (Other) :537862 Max. :516.0
## NA's : 3287 NA's :1125843
## weapon_description
## STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE): 339404
## VERBAL THREAT : 46246
## UNKNOWN WEAPON/OTHER WEAPON : 43517
## HAND GUN : 26849
## SEMI-AUTOMATIC PISTOL : 10620
## (Other) : 90829
## NA's :1125844
## status_code status_description crime_code_1
## IC :1302599 Adult Arrest: 172666 Min. :110.0
## AO : 190613 Adult Other : 190613 1st Qu.:330.0
## AA : 172666 Invest Cont :1302599 Median :440.0
## JA : 13246 Juv Arrest : 13246 Mean :506.7
## JO : 4154 Juv Other : 4154 3rd Qu.:626.0
## (Other): 29 UNK : 31 Max. :999.0
## NA's : 2 NA's :6
## crime_code_2 crime_code_3 crime_code_4
## Min. :210.0 Min. : 93.0 Min. :421.0
## 1st Qu.:998.0 1st Qu.:998.0 1st Qu.:998.0
## Median :998.0 Median :998.0 Median :998.0
## Mean :953.3 Mean :970.3 Mean :969.9
## 3rd Qu.:998.0 3rd Qu.:998.0 3rd Qu.:998.0
## Max. :999.0 Max. :999.0 Max. :999.0
## NA's :1576655 NA's :1680910 NA's :1683231
## address
## 6TH ST : 3897
## 7TH ST : 3014
## 9300 TAMPA AV: 2873
## 5TH ST : 2415
## 6TH : 2409
## (Other) :1668668
## NA's : 33
## cross_street location
## BROADWAY : 4923 (34.1016, -118.3387): 3192
## FIGUEROA : 2969 (34.1905, -118.6059): 2316
## VERMONT AV: 2953 (33.9892, -118.3089): 2001
## SAN PEDRO : 2821 (34.1576, -118.438) : 1797
## WESTERN AV: 2821 (34.2216, -118.4488): 1684
## (Other) : 263330 (Other) :1665894
## NA's :1403492 NA's : 6425
## location_x location_y crime_month crime_year
## Min. :33.34 Min. :-118.8 01 :160757 2017 :227931
## 1st Qu.:34.01 1st Qu.:-118.4 08 :144028 2016 :223690
## Median :34.06 Median :-118.3 10 :143885 2015 :214212
## Mean :34.08 Mean :-118.4 07 :143685 2010 :208631
## 3rd Qu.:34.18 3rd Qu.:-118.3 05 :141384 2012 :200888
## Max. :34.79 Max. :-117.7 06 :139076 2011 :200251
## NA's :6425 NA's :6425 (Other):810494 (Other):407706
## crime_minute crime_time_interval occurred_week_day
## Min. : 60 Late_night :218525 lunes :239881
## 1st Qu.: 840 Early_morning :186980 martes :235817
## Median :1400 Late_morning :336565 miércoles:238315
## Mean :2382 Early_afternoon:340978 jueves :236632
## 3rd Qu.:3960 Late_afternoon :385880 viernes :259729
## Max. :7280 Evening :214381 sábado :242043
## domingo :230892
## delay_reported
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 1.00
## Mean : 16.76
## 3rd Qu.: 2.00
## Max. :2941.00
##
Estado del informe del crimen. Vemos una tabla con las opciones
table(crime_data$status_description, useNA = 'ifany')
##
## Adult Arrest Adult Other Invest Cont Juv Arrest Juv Other
## 172666 190613 1302599 13246 4154
## UNK
## 31
Creamos una variable para predecir, target
# Empezamos con todos los valores como 'Closed'
crime_data$target <- 'Closed'
# Y ahora los que crime_data$status_description=='Invest Cont' pasan a 'Open'
crime_data$target[crime_data$status_description=='Invest Cont'] <- 'Open'
# Una tabla
table(crime_data$target, useNA = 'ifany')
##
## Closed Open
## 380710 1302599
# Y pasamos a factor
crime_data$target <- as.factor(crime_data$target)
# training hasta el año 2016
training <- crime_data[crime_data$date_occurred<as.Date('2017-01-01'), ]
# Validating el año 2017
validating <- crime_data[(crime_data$date_occurred>as.Date('2016-12-31') & crime_data$date_occurred<as.Date('2018-01-01')), ]
# testing el año 2018
testing <- crime_data[crime_data$date_occurred>as.Date('2017-12-31'), ]
# Iniciamos con todos los cores posibles y la memoria de 32G (si hay menos se
# autoregula)
h2o.init(nthreads=-1, max_mem_size="32G")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 45 minutes 43 seconds
## H2O cluster version: 3.16.0.2
## H2O cluster version age: 3 months and 12 days !!!
## H2O cluster name: H2O_started_from_R_enero_ahz489
## H2O cluster total nodes: 1
## H2O cluster total memory: 6.35 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.3 (2017-11-30)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (3 months and 12 days)!
## Please download and install the latest version from http://h2o.ai/download/
# Si queremos conectar a un cluster que ya está funcionando
# h2o.init(ip="mycluster", port=55555)
Pasamos los objetos a H2O y les damos nombre para poder identificarlos
training_h2o <- as.h2o(training,
destination_frame = "training")
##
|
| | 0%
|
|=================================================================| 100%
validating_h2o <- as.h2o(validating,
destination_frame = "validating")
##
|
| | 0%
|
|=================================================================| 100%
testing_h2o <- as.h2o(testing,
destination_frame = "testing")
##
|
| | 0%
|
|=================================================================| 100%
Tras estos pasos en los que someramente hemos hecho ETL, tratamiento de NAs y algunas variables nuevas, pasamos a la parte de autoML
# La columna target
y <- "target"
# Las otras variables a usar, primero todas menos el target
x <- setdiff(names(training), y)
# Ahora quitamos también: 'dr_number', 'status_code', 'status_description'
x <- x[!(x %in% c('dr_number', 'status_code', 'status_description'))]
Llamamos a automl. Solo le decimos los datos, la columna target, que hay un dataset de train y uno de validación y cuantos minutos de ejecución
tiempo_inicio <- Sys.time()
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = training_h2o,
validation_frame = validating_h2o,
# leaderboard_frame = test_h2o,
max_runtime_secs = 3600, # 180
stopping_metric = "AUTO")
##
|
| | 0%
|
|== | 3%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|=================================================================| 100%
##
|
| | 0%
|
|=================================================================| 100%
print(Sys.time()-tiempo_inicio)
## Time difference of 1.099437 hours
Podemos ver lo que hace H2O en: http://localhost:54321/flow/index.html
Creamos un objeto con el mejor modelo
automl_leader <- automl_models_h2o@leader
# Pasamos el modelo y los datos nuevos
pred_testing <- as.data.frame(h2o.predict(automl_leader, newdata = testing_h2o))
##
|
| | 0%
|
|=================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'crime_code_description' has levels not trained
## on: [238TH ST, AARON ST, ALSACE DR, ALVA, APPLETON WY, ARMACOST BL, ARTESIA
## BL, AVE OF STARS, BALI WY, BARBARA AV, BELOIT BL, BRIGHT, BROADLEAF AV,
## BROOKE, BUTTE ST, CALDUS AV, CAMINO DEL ORO ST, CARPENTER PL, CIMMARON BL,
## CLOVERDALE RD, CORINGA DR, CRESTKNOLL DR, DOBBS, DOHENY AV, DON LORENZO
## ST, DOREEN PL, E CESAR E CHAVEZ, E DENKER AV, E HILL ST, E KOHLER ST,
## E NEW HAMPSHIRE AV, E OXFORD AV, E SPRING ST, E SPROULE AV, EDGECLIFFE
## DR, ESCARPA DR, EUNICE, FAIRVIEW BL, FALKIRK LN, FORNEY, GENESSEE AV,
## GLADBECK, GRAND PARK, GROSVENOR BL, JACON WY, JARVIS, LUCIA PL, MARKLEIN
## AV, MATEO AV, MERCHANT ST, MERIT AV, METRO, MILKY WY, N 30TH ST, N 40TH
## PL, N BESSEMER ST, N POLK, PARMALEE AV, PASADENA ST, PIER, RAPHAEL, REAZA
## PL, REDMONT AV, REFORMA, RINCONIA DR, S 28TH ST, S CLINTON ST, S MULLEN AV,
## S PACIFIC AV, S SATICOY ST, S SAVANNAH AV, SAMPSON AV, SHERWOOD, SMILEY,
## STONE OAK DR, THE GROVE WY, TUPACHI ST, URBANA, VALIJEAN AV, VAN NUYS PL,
## VAN NUYS ST, VESTAL AV, W 74TH ST, W ANAHEIM ST, W GAGE ST, W ORANGE DR,
## WEDGEWOOD PL, WEYBURN PL, ZPHYR]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'premise_description' has levels not trained on:
## [7TH AND METRO CENTER (NOT LINE SPECIFIC), BUS DEPOT/TERMINAL, OTHER THAN
## MTA, ELECTRONICS STORE (IE:RADIO SHACK, ETC.), FURNITURE STORE, LA UNION
## STATION (NOT LINE SPECIFIC), MTA - BLUE LINE - 103RD/WATTS TOWERS, MTA -
## BLUE LINE - 7TH AND METRO CENTER, MTA - BLUE LINE - GRAND/LATTC, MTA - BLUE
## LINE - PICO, MTA - BLUE LINE - SAN PEDRO, MTA - BLUE LINE - WASHINGTON,
## MTA - EXPO LINE - 7TH AND METRO CENTER, MTA - EXPO LINE - EXPO PARK/USC,
## MTA - EXPO LINE - EXPO/BUNDY, MTA - EXPO LINE - EXPO/CRENSHAW, MTA - EXPO
## LINE - EXPO/LA BREA, MTA - EXPO LINE - EXPO/SEPULVEDA, MTA - EXPO LINE -
## EXPO/VERMONT, MTA - EXPO LINE - EXPO/WESTERN, MTA - EXPO LINE - FARMDALE,
## MTA - EXPO LINE - JEFFERSON/USC, MTA - EXPO LINE - LA CIENEGA/JEFFERSON,
## MTA - EXPO LINE - LATTC/ORTHO INSTITUTE, MTA - EXPO LINE - PALMS, MTA -
## GOLD LINE - HIGHLAND PARK, MTA - GOLD LINE - MARIACHI PLAZA, MTA - GOLD
## LINE - SOUTHWEST MUSEUM, MTA - GOLD LINE - UNION STATION, MTA - GREEN LINE
## - AVALON, MTA - GREEN LINE - AVIATION/LAX, MTA - GREEN LINE - HARBOR FWY,
## MTA - PURPLE LINE - CIVIC CENTER/GRAND PARK, MTA - PURPLE LINE - PERSHING
## SQUARE, MTA - PURPLE LINE - WILSHIRE/VERMONT, MTA - RED LINE - 7TH AND
## METRO CENTER, MTA - RED LINE - CIVIC CENTER/GRAND PARK, MTA - RED LINE
## - HOLLYWOOD/HIGHLAND, MTA - RED LINE - HOLLYWOOD/VINE, MTA - RED LINE -
## HOLLYWOOD/WESTERN, MTA - RED LINE - NORTH HOLLYWOOD, MTA - RED LINE - UNION
## STATION, MTA - RED LINE - UNIVERSAL CITY/STUDIO CITY, MTA - RED LINE -
## VERMONT/BEVERLY, MTA - RED LINE - VERMONT/SANTA MONICA, MTA - RED LINE -
## VERMONT/SUNSET, MTA - RED LINE - WESTLAKE/MACARTHUR PARK, MTA - RED LINE
## - WILSHIRE/VERMONT, MTA - SILVER LINE - HARBOR FWY, MTA - SILVER LINE -
## HARBOR GATEWAY TRANSIT CTR, MTA - SILVER LINE - ROSECRANS, MUSCLE BEACH,
## SEX ORIENTED/BOOK STORE/STRIP CLUB/GENTLEMAN'S CLUB, SHORT-TERM VACATION
## RENTAL, TERMINAL, OTHER THAN MTA, TRAIN DEPOT/TERMINAL, OTHER THAN MTA,
## TRANSITIONAL HOUSING/HALFWAY HOUSE, VETERINARIAN/ANIMAL HOSPITAL]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'address' has levels not trained on: [00 PORTOLA
## RD, 00000 LOYOLA MARYM DR, 04600 HUNTINGTON DR, 100 COPA DE ORO RD, 100
## HILGARD AV, 100 MESA ST, 1000 SPENCER ST, 1000 S BUDLONG, 1000 W 2ND ST,
## 10000 SANTA MONICA BL, 10100 GRANDEE AV, 10100 MOORPARK ST, 10100 S ALAMEDA
## ST, 10300 STATE ROUTE, 10500 CALIFAST AV, 10500 SELKIRK LN, 10600 JORDAN
## AV, 10600 LANARK ST, 10800 OSTEGO ST, 11000 COLUMBUS AV, 112TH PL, 11300
## BABBITT AV, 11300 S OLIVE ST, 11TH DR, 1200 LOS LOMAS AV, 1200 PRODUCE,
## 1200 TOWER RD, 12000 MARIBEL LN, 12000 SHERMAN RD, 12100 ANETA ST, 12200
## MILLENNIUM, 12300 5TH HELENA DR, 12300 CENTINELA AV, 12300 VICTORIA AV,
## 12400 HART ST, 12600 BLUFF CREEK DR, 12600 HUSTON ST, 12900 HOFFMAN ST,
## 13500 RIVERSIDE DR, 13500 SILVER PALMS LN, 13800 KAGEL CANYON ST, 1400
## BECKWITH AV, 1400 CALLE DEL JONELA, 1400 N BERENDO ST, 14200 CANTLAY ST,
## 14800 ASTORIA, 1500 EXPOSITION BL, 1500 W 249TH ST, 15700 HIGH KNOLL RD,
## 1600 E 3RD ST, 1600 S NEW ENGLAND ST, 16300 N SAN FERNANDO MISSION BL,
## 16400 MIRILLA ST, 16600 OLDHAM PL, 169TH ST, 1700 N NEW HAMPSHIRE, 1700 S
## BOYLE AV, 18200 CHATHAM LN, 18600 LEMAY ST, 189TH ST, 1900 PACIFIC COAST
## HY, 1900 RAPALLO PL, 19000 WINNETKA AV, 19300 LAVY CT, 19300 OLYMPIA
## ST, 19500 EMPIRE LN, 19800 CASTLEBAY LN, 19800 ITASCA ST, 200 ENERO CT,
## 200 LAKESHORE TR, 200 S INDIANA ST, 200 S SALTAIR AV, 200 W WORLD WY,
## 2000 TESLA AV, 20000 DAMPA LN, 20600 DUMONT ST, 2100 S BUDLONG AV, 2200
## BRENTFORD RD, 2200 MALAGA RD, 2200 RIPPLE ST, 22000 LINFIELD LN, 22TH,
## 2300 SKIRBALL CENTER DR, 23600 MARTHA ST, 2400 FERN DELL, 24000 SCHOENBORN
## ST, 2700 PATRICIA AV, 27700 HOLLYRIDGE DR, 300 E 38TH ST, 300 S AVENUE
## 16, 3100 BUDAN AV, 3200 MCCARTHY DR, 3200 WONDER VIEW DR, 3400 EXPOSITION
## BL, 3400 W MARTIN LUTHER KING BL, 400 S GRAND VIEW ST, 4000 FOLSOM ST,
## 4000 VIA MIRASOL, 4000 S CENTINELA AV, 4100 S GRAND AV, 4100 W 42ND PL,
## 4600 HUNTINGTON DR SOUTH, 4800 MT HELENA AV, 4800 SANDFORD DR, 4800 THOMAS
## ST, 500 S AVENUE 17, 5100 GALVEZ ST, 5100 LATHROP ST, 5300 DANNY AV, 5300
## DURATE ST, 5500 DUBOIS AV, 5700 ROLLING RD, 5800 DREXEL AV, 5900 W 83RD
## ST, 600 DUCOMMUN ST, 600 E L ST, 6700 BERQUIST AV, 6900 VANASCOY AV, 700
## ARMACOST AV, 700 SHORE, 700 VENICE WY, 7000 RIDGE AV, 7200 N MAPLE RD,
## 7400 CLIFFSIDE CT, 7700 WOODHALL AV, 800 APPLEBY, 800 MILWOOD CT, 800
## SOUTHERLAND AV, 800 TACUBA ST, 8000 PARK HILL DR, 8100 SHADY GLADE AV,
## 8300 BARHAM BL, 8300 BIG CANYON DR, 900 COOPER AV, 900 E 18TH ST, 900 E
## MARTIN LUTHER KING BL, 900 N STALEY LN, 900 S UNION AV, 9100 LARKE ELLEN
## CI, 9200 VENA AV, 93200 GRAHAM AV, 9400 GLORY AV, 9500 REGENT ST, ALTIVO
## WY, BARMAN AV, BIRD, BROADMOORE ST, BUENA VISTA DR, CAHUENGA BL, CALDUS
## AV, CASS, CENTURY PARK DR, CIVIC CENTER, COPPER, CRAFT AV, CRAVEN ST, DENNI
## AV, DOLPHIN AV, E 104TH ST, E AVENUE 28, E AVENUE 33, EARL ST, ELKLAND PL,
## FACTORY BL, GARFIELD ST, GAZETTE ST, GRAHAM PL, HAYES AV, HOSTETER FIRE
## RD, JAMES M WOODS BL, LA FAYETTE P, LONG BEACH FY, LONGFELLOW, MC CLURE
## AV, MIDWOOD DR, MONTECITO ST, MOUNT SHASTA, N 97TH ST, PEMBROKE, PERRIS ST,
## PETROLEUM, RAJAH, RIDGELEY ST, RINCONIA DR, ROXBURY DR, S MAPLE AV, S SAINT
## LOUIS ST, S WESTLAKE, SCARFF ST, STONEHAVEN, SUMMIT, SWEETZER DR, TRUMAN
## ST, VALLEY MEADOW, VANOWNEN ST, W 106TH ST, WEST WAY, WHITESIDE, WHITLEY
## PL]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'cross_street' has levels not trained on: [238TH
## ST, AARON ST, ALSACE DR, ALVA, APPLETON WY, ARMACOST BL, ARTESIA BL, AVE OF
## STARS, BALI WY, BARBARA AV, BELOIT BL, BRIGHT, BROADLEAF AV, BROOKE, BUTTE
## ST, CALDUS AV, CAMINO DEL ORO ST, CARPENTER PL, CIMMARON BL, CLOVERDALE
## RD, CORINGA DR, CRESTKNOLL DR, DOBBS, DOHENY AV, DON LORENZO ST, DOREEN
## PL, E CESAR E CHAVEZ, E DENKER AV, E HILL ST, E KOHLER ST, E NEW HAMPSHIRE
## AV, E OXFORD AV, E SPRING ST, E SPROULE AV, EDGECLIFFE DR, ESCARPA DR,
## EUNICE, FAIRVIEW BL, FALKIRK LN, FORNEY, GENESSEE AV, GLADBECK, GRAND PARK,
## GROSVENOR BL, JACON WY, JARVIS, LUCIA PL, MARKLEIN AV, MATEO AV, MERCHANT
## ST, MERIT AV, METRO, MILKY WY, N 30TH ST, N 40TH PL, N BESSEMER ST, N POLK,
## PARMALEE AV, PASADENA ST, PIER, RAPHAEL, REAZA PL, REDMONT AV, REFORMA,
## RINCONIA DR, S 28TH ST, S CLINTON ST, S MULLEN AV, S PACIFIC AV, S SATICOY
## ST, S SAVANNAH AV, SAMPSON AV, SHERWOOD, SMILEY, STONE OAK DR, THE GROVE
## WY, TUPACHI ST, URBANA, VALIJEAN AV, VAN NUYS PL, VAN NUYS ST, VESTAL AV,
## W 74TH ST, W ANAHEIM ST, W GAGE ST, W ORANGE DR, WEDGEWOOD PL, WEYBURN PL,
## ZPHYR]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'location' has levels not trained on:
## [(33.7315, -118.2971), (33.7372, -118.317), (33.7379, -118.3065),
## (33.7529, -118.2971), (33.7851, -118.2808), (33.7875, -118.2507),
## (33.7921, -118.2563), (33.7987, -118.3053), (33.8614, -118.285),
## (33.8698, -118.2849), (33.878, -118.2886), (33.933, -118.2402), (33.933,
## -118.2794), (33.937, -118.2792), (33.9389, -118.2328), (33.9454,
## -118.2283), (33.947, -118.243), (33.9555, -118.2749), (33.9751, -118.4211),
## (33.9766, -118.3946), (33.9768, -118.4634), (33.9773, -118.2916),
## (33.9795, -118.4082), (33.9799, -118.3703), (33.9818, -118.4683),
## (33.982, -118.4685), (33.984, -118.4017), (33.9853, -118.4092), (33.9869,
## -118.4728), (33.9881, -118.4146), (33.9893, -118.2438), (33.9943,
## -118.4578), (33.9945, -118.4615), (33.9996, -118.459), (34.0006,
## -118.4312), (34.0055, -118.4429), (34.0073, -118.4347), (34.0092,
## -118.259), (34.0209, -118.4373), (34.0222, -118.2394), (34.0249,
## -118.3551), (34.026, -118.3508), (34.0271, -118.2537), (34.0274,
## -118.3973), (34.0294, -118.4367), (34.0343, -118.2445), (34.0367,
## -118.2964), (34.0369, -118.473), (34.0371, -118.2963), (34.0384,
## -118.4572), (34.0392, -118.4141), (34.0419, -118.2877), (34.0421,
## -118.2112), (34.0434, -118.1929), (34.0438, -118.3342), (34.0477,
## -118.4631), (34.048, -118.3878), (34.0495, -118.2237), (34.0505,
## -118.2599), (34.0513, -118.2222), (34.0522, -118.2342), (34.0532,
## -118.4786), (34.0533, -118.4958), (34.0543, -118.2031), (34.0606,
## -118.3496), (34.0637, -118.2864), (34.064, -118.4141), (34.0675,
## -118.2214), (34.0717, -118.4628), (34.072, -118.2103), (34.0723,
## -118.3658), (34.078, -118.568), (34.0783, -118.2057), (34.0789, -118.4397),
## (34.079, -118.2969), (34.0898, -118.3244), (34.0915, -118.2458), (34.0917,
## -118.213), (34.0948, -118.2495), (34.1009, -118.283), (34.1042, -118.2647),
## (34.1053, -118.2504), (34.1109, -118.3079), (34.1112, -118.1868),
## (34.1112, -118.4483), (34.1113, -118.3451), (34.1121, -118.1887),
## (34.1167, -118.3256), (34.1198, -118.3179), (34.1211, -118.3193),
## (34.1218, -118.1744), (34.1218, -118.2239), (34.1226, -118.1738), (34.123,
## -118.4792), (34.1302, -118.21), (34.142, -118.2699), (34.1432, -118.4947),
## (34.1464, -118.1957), (34.1566, -118.3734), (34.1576, -118.4295),
## (34.1594, -118.4076), (34.1616, -118.5822), (34.1635, -118.6071),
## (34.1701, -118.6398), (34.1707, -118.5203), (34.1747, -118.6097),
## (34.1877, -118.5183), (34.1892, -118.4774), (34.191, -118.5386), (34.1912,
## -118.5244), (34.1922, -118.6346), (34.193, -118.39), (34.1975, -118.4705),
## (34.1975, -118.4726), (34.1992, -118.5873), (34.2012, -118.4043), (34.2018,
## -118.394), (34.2024, -118.4424), (34.2056, -118.65), (34.206, -118.6344),
## (34.2104, -118.6368), (34.2205, -118.6351), (34.228, -118.3693),
## (34.2294, -118.4454), (34.2322, -118.5815), (34.2364, -118.5361),
## (34.2393, -118.2798), (34.2418, -118.4338), (34.2429, -118.5675),
## (34.2447, -118.5657), (34.2468, -118.4512), (34.255, -118.5393),
## (34.2566, -118.4332), (34.2622, -118.6039), (34.2629, -118.2923),
## (34.2636, -118.6067), (34.2656, -118.553), (34.2674, -118.4664),
## (34.2697, -118.4027), (34.2722, -118.463), (34.2735, -118.5697), (34.2761,
## -118.4259), (34.2818, -118.5309), (34.2846, -118.3923), (34.2931,
## -118.5538), (34.3117, -118.4186), (34.3218, -118.4468), (34.3265,
## -118.4341)]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'crime_year' has levels not trained on: [2018]
Vemos el objeto de la predicción
str(pred_testing)
## 'data.frame': 21187 obs. of 3 variables:
## $ predict: Factor w/ 2 levels "Closed","Open": 2 2 2 2 2 2 2 2 2 2 ...
## $ Closed : num 0.0531 0.0603 0.1482 0.0616 0.1985 ...
## $ Open : num 0.947 0.94 0.852 0.938 0.801 ...
Un resumen de la performance en el objeto de test
h2o.performance(automl_leader, newdata = testing_h2o)
## H2OBinomialMetrics: stackedensemble
##
## MSE: 0.1189653
## RMSE: 0.3449135
## LogLoss: 0.3745473
## Mean Per-Class Error: 0.4982028
## AUC: 0.7885211
## Gini: 0.5770422
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Closed Open Error Rate
## Closed 8 2095 0.996196 =2095/2103
## Open 4 19080 0.000210 =4/19084
## Totals 12 21175 0.099070 =2099/21187
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.089551 0.947863 397
## 2 max f2 0.079476 0.978436 399
## 3 max f0point5 0.393782 0.923999 290
## 4 max accuracy 0.089551 0.900930 397
## 5 max precision 0.950663 1.000000 0
## 6 max recall 0.079476 1.000000 399
## 7 max specificity 0.950663 1.000000 0
## 8 max absolute_mcc 0.708349 0.297129 162
## 9 max min_per_class_accuracy 0.731609 0.727532 151
## 10 max mean_per_class_accuracy 0.758218 0.731465 137
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
as.data.frame(automl_models_h2o@leaderboard)
## model_id auc logloss
## 1 StackedEnsemble_AllModels_0_AutoML_20180313_122540 NA NA
## 2 StackedEnsemble_BestOfFamily_0_AutoML_20180313_122540 NA NA
## 3 DRF_0_AutoML_20180313_122540 NA NA
## 4 XRT_0_AutoML_20180313_122540 NA NA
## 5 GBM_grid_0_AutoML_20180313_122540_model_0 NA NA
Y vemos el modelo
automl_leader
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_AllModels_0_AutoML_20180313_122540
## NULL
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.07489115
## RMSE: 0.2736625
## LogLoss: 0.2651635
## Mean Per-Class Error: 0.1527908
## AUC: 0.9428592
## Gini: 0.8857184
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Closed Open Error Rate
## Closed 245185 86336 0.260424 =86336/331521
## Open 49794 1052876 0.045158 =49794/1102670
## Totals 294979 1139212 0.094918 =136130/1434191
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.556156 0.939279 210
## 2 max f2 0.301893 0.962042 296
## 3 max f0point5 0.756018 0.939519 137
## 4 max accuracy 0.583495 0.905195 201
## 5 max precision 0.952286 1.000000 0
## 6 max recall 0.065308 1.000000 394
## 7 max specificity 0.952286 1.000000 0
## 8 max absolute_mcc 0.626318 0.727934 188
## 9 max min_per_class_accuracy 0.805609 0.869694 115
## 10 max mean_per_class_accuracy 0.766066 0.871189 133
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 0.1183872
## RMSE: 0.3440744
## LogLoss: 0.3783586
## Mean Per-Class Error: 0.3445659
## AUC: 0.8457715
## Gini: 0.691543
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Closed Open Error Rate
## Closed 16495 30591 0.649684 =30591/47086
## Open 7134 173711 0.039448 =7134/180845
## Totals 23629 204302 0.165511 =37725/227931
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.331162 0.902050 304
## 2 max f2 0.129093 0.951283 377
## 3 max f0point5 0.703089 0.894299 160
## 4 max accuracy 0.391017 0.835933 282
## 5 max precision 0.950596 0.998028 0
## 6 max recall 0.063875 1.000000 399
## 7 max specificity 0.950596 0.999958 0
## 8 max absolute_mcc 0.627734 0.473369 192
## 9 max min_per_class_accuracy 0.781789 0.767426 123
## 10 max mean_per_class_accuracy 0.802383 0.768738 113
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.1158442
## RMSE: 0.340359
## LogLoss: 0.3723338
## Mean Per-Class Error: 0.2803854
## AUC: 0.868436
## Gini: 0.7368721
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Closed Open Error Rate
## Closed 165562 165959 0.500599 =165959/331521
## Open 66350 1036320 0.060172 =66350/1102670
## Totals 231912 1202279 0.161979 =232309/1434191
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.462545 0.899213 248
## 2 max f2 0.141606 0.945331 367
## 3 max f0point5 0.770436 0.897762 130
## 4 max accuracy 0.525549 0.839002 226
## 5 max precision 0.951479 0.995525 0
## 6 max recall 0.053853 1.000000 399
## 7 max specificity 0.951479 0.999937 0
## 8 max absolute_mcc 0.710315 0.536213 155
## 9 max min_per_class_accuracy 0.825580 0.791672 102
## 10 max mean_per_class_accuracy 0.830853 0.791855 99
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
Matriz de confusión sobre datos de testing con el paquete caret
confusion_matrix <- caret::confusionMatrix(pred_testing$predict, testing$target)
confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction Closed Open
## Closed 592 1356
## Open 1511 17728
##
## Accuracy : 0.8647
## 95% CI : (0.86, 0.8693)
## No Information Rate : 0.9007
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : 0.2176
## Mcnemar's Test P-Value : 0.004026
##
## Sensitivity : 0.28150
## Specificity : 0.92895
## Pos Pred Value : 0.30390
## Neg Pred Value : 0.92146
## Prevalence : 0.09926
## Detection Rate : 0.02794
## Detection Prevalence : 0.09194
## Balanced Accuracy : 0.60522
##
## 'Positive' Class : Closed
##
Y por último, la información de la sesión
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_ES.UTF-8 LC_COLLATE=es_ES.UTF-8
## [5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=es_ES.UTF-8
## [7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] h2o_3.16.0.2 pROC_1.10.0 chron_2.3-52
## [4] lubridate_1.7.3 ggplot2_2.2.1.9000 dplyr_0.7.4
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.15 lattice_0.20-35 tidyr_0.8.0
## [4] class_7.3-14 assertthat_0.2.0 rprojroot_1.3-2
## [7] digest_0.6.15 ipred_0.9-6 psych_1.7.8
## [10] foreach_1.4.4 R6_2.2.2 plyr_1.8.4
## [13] backports_1.1.2 stats4_3.4.3 e1071_1.6-8
## [16] evaluate_0.10.1 pillar_1.2.1 rlang_0.2.0.9000
## [19] lazyeval_0.2.1 caret_6.0-78 kernlab_0.9-25
## [22] rpart_4.1-13 Matrix_1.2-12 rmarkdown_1.9
## [25] splines_3.4.3 CVST_0.2-1 ddalpha_1.3.1.1
## [28] gower_0.1.2 stringr_1.3.0 foreign_0.8-69
## [31] RCurl_1.95-4.10 munsell_0.4.3 broom_0.4.3
## [34] compiler_3.4.3 pkgconfig_2.0.1 mnormt_1.5-5
## [37] dimRed_0.1.0 htmltools_0.3.6 tidyselect_0.2.4
## [40] nnet_7.3-12 prodlim_1.6.1 tibble_1.4.2
## [43] DRR_0.0.3 codetools_0.2-15 RcppRoll_0.2.2
## [46] withr_2.1.1.9000 MASS_7.3-49 bitops_1.0-6
## [49] recipes_0.1.2 ModelMetrics_1.1.0 grid_3.4.3
## [52] nlme_3.1-131 jsonlite_1.5 gtable_0.2.0
## [55] magrittr_1.5 scales_0.5.0.9000 stringi_1.1.7
## [58] reshape2_1.4.3 bindrcpp_0.2 timeDate_3043.102
## [61] robustbase_0.92-8 lava_1.6 iterators_1.0.9
## [64] tools_3.4.3 glue_1.2.0 DEoptimR_1.0-8
## [67] purrr_0.2.4 sfsmisc_1.1-2 survival_2.41-3
## [70] parallel_3.4.3 yaml_2.1.18 colorspace_1.3-2
## [73] knitr_1.20 bindr_0.1