Diabetes in Pima Indian Women

Characteristics of Pima Indian women tested for diabetes are used in this example to predict their disease statuses.

Characteristic Value
Number of women 392
diabetes
  pos 130 (33.16%)
  neg 262 (66.84%)
pregnant: Median (Range) 2 (0, 17)
glucose: Median (Range) 119 (56, 198)
pressure: Median (Range) 70 (24, 110)
triceps: Median (Range) 29 (7, 63)
insulin: Median (Range) 125.5 (14, 846)
mass: Median (Range) 33.2 (18.2, 67.1)
pedigree: Median (Range) 0.4495 (0.085, 2.420)
age: Median (Range) 27 (21, 81)

Training Set Analysis

## Analysis libraries
## MachineShop supplies the modeling functions used throughout this example;
## ggplot2 supplies coord_fixed(), which is added to the ROC curve plot.
library(MachineShop)
library(ggplot2)

## Parallel processing
## doParallel supplies registerDoParallel(); a backend with 6 cores is
## registered here (lower the count on machines with fewer cores).
library(doParallel)
registerDoParallel(cores = 6)

## Dataset
## Load PimaIndiansDiabetes2 from the mlbench package and keep only the rows
## without missing values.
data(PimaIndiansDiabetes2, package = "mlbench")
Pima <- na.omit(PimaIndiansDiabetes2)

## Model formula
## diabetes is the response; "." stands for all other columns of the data
## supplied at fit time, which enter as predictors.
fo <- diabetes ~ .

## Model selected from tuned models
## Three candidate models (KNNModel, NNetModel, RandomForestModel) are each
## wrapped in TunedModel() with grid = 5 and handed to SelectedModel().
## NOTE(review): presumably grid = 5 sets the size of the tuning grid and
## SelectedModel() keeps the best-performing candidate -- confirm against the
## MachineShop documentation.
selected_model <- SelectedModel(TunedModel(KNNModel, grid = 5),
                                TunedModel(NNetModel, grid = 5),
                                TunedModel(RandomForestModel, grid = 5))

## Model fit
## Fit the selected model to the complete-case data using the formula above.
model_fit <- fit(fo, data = Pima, model = selected_model)

## Variable importance
## Computed from the fitted model and stored in vi; it is not printed or
## plotted in this section.
vi <- varimp(model_fit)

Generalization Performance

## Resample estimation with cross-validation
## The same formula, data, and model specification used for the training fit
## are resampled; CVControl selects cross-validation as the resampling method.
res <- resample(fo, data = Pima, model = selected_model, control = CVControl)

## Estimated performance
## Summarize each performance metric over the resampled results.
summary(res)
#>              Statistic
#> Metric             Mean    Median         SD       Min       Max NA
#>   Brier       0.1441490 0.1385509 0.04313255 0.1060426 0.2562013  0
#>   Accuracy    0.7857051 0.7974359 0.06529521 0.6410256 0.8717949  0
#>   Kappa       0.4999248 0.5123220 0.15928724 0.1250000 0.6808511  0
#>   ROC AUC     0.8531339 0.8786982 0.08404156 0.6375740 0.9186391  0
#>   Sensitivity 0.6153846 0.6153846 0.13074409 0.3076923 0.7692308  0
#>   Specificity 0.8700855 0.8682336 0.06074830 0.8076923 1.0000000  0

## Variable probability cutoff
## Recompute the performance metrics from the resampled results with
## cutoff = 0.25 and summarize them. Compared with the summary above,
## Sensitivity rises and Specificity and Accuracy fall, while Brier and
## ROC AUC are unchanged.
summary(performance(res, cutoff = 0.25))
#>              Statistic
#> Metric             Mean    Median         SD       Min       Max NA
#>   Brier       0.1441490 0.1385509 0.04313255 0.1060426 0.2562013  0
#>   Accuracy    0.7499359 0.7435897 0.06878381 0.6410256 0.8500000  0
#>   Kappa       0.5013070 0.5080645 0.14027061 0.2500000 0.6946565  0
#>   ROC AUC     0.8531339 0.8786982 0.08404156 0.6375740 0.9186391  0
#>   Sensitivity 0.8846154 0.9230769 0.12162606 0.6153846 1.0000000  0
#>   Specificity 0.6831909 0.6730769 0.06161579 0.5925926 0.7777778  0

ROC Curve

## True positive and false positive rates over all probability cutoffs
roc <- performance_curve(res)

## ROC curve
## diagonal = TRUE is passed to the plot method (presumably drawing the
## chance reference line); coord_fixed() from ggplot2 fixes the aspect ratio
## so both axes share the same scale.
plot(roc, diagonal = TRUE) + coord_fixed()

## Area under the curve
auc(roc)
#> Model: SelectedModel
#> [1] 0.8464381

Confusion Matrices

## Confusion matrix of predicted versus observed classes from the resampled
## results; the outer parentheses print the object as it is assigned.
(conf <- confusion(res))
#> --- ConfusionList object ----------------------------------------------------------------------
#> === $SelectedModel ============================================================================
#> === BinaryConfusionMatrix object ===
#>          Observed
#> Predicted neg pos
#>       neg 228  50
#>       pos  34  80

## Summary statistics derived from the confusion matrix
summary(conf)
#> --- $SelectedModel ----------------------------------------------------------------------------
#> Number of responses: 392
#> Accuracy (SE): 0.7857143 (0.02072459)
#> Majority class: 0.6683673
#> Kappa: 0.5011514
#>                   neg       pos
#> Observed    0.6683673 0.3316327
#> Predicted   0.7091837 0.2908163
#> Agreement   0.5816327 0.2040816
#> Sensitivity 0.8702290 0.6153846
#> Specificity 0.6153846 0.8702290
#> PPV         0.8201439 0.7017544
#> NPV         0.7017544 0.8201439

## Confusion matrix plot
plot(conf)
#> $SelectedModel

## Variable probability cutoff
## Rebuild the confusion matrix from the resampled results with cutoff = 0.25
## and print its summary statistics. Just over half of the 392 responses are
## now predicted pos (0.5051020), and Sensitivity for the pos class is
## 0.8846154.
summary(confusion(res, cutoff = 0.25))
#> --- $SelectedModel ----------------------------------------------------------------------------
#> Number of responses: 392
#> Accuracy (SE): 0.75 (0.02187044)
#> Majority class: 0.6683673
#> Kappa: 0.5017122
#>                   neg       pos
#> Observed    0.6683673 0.3316327
#> Predicted   0.4948980 0.5051020
#> Agreement   0.4566327 0.2933673
#> Sensitivity 0.6832061 0.8846154
#> Specificity 0.8846154 0.6832061
#> PPV         0.9226804 0.5808081
#> NPV         0.5808081 0.9226804

Calibration Curve

## Calibration estimates computed from the resampled results.
## NOTE(review): breaks = NULL presumably requests a smoothed curve instead of
## binned estimates -- confirm against the MachineShop calibration() docs.
cal <- calibration(res, breaks = NULL)
## Plot the calibration curve; se = TRUE is passed to the plot method
## (presumably to add standard-error bands).
plot(cal, se = TRUE)

Partial Dependence Plots

## Partial dependence of the fitted model on three selected predictors
## (glucose, age, insulin); the result is stored in pd for later use.
pd <- dependence(model_fit, select = c(glucose, age, insulin))