# Resampling scheme: repeated cross-validation with 3 repeats, summarised by
# twoClassSummary.
ctrl <- trainControl(
  method = "repeatedcv",
  repeats = 3,
  summaryFunction = twoClassSummary
)

# Fit method "LogitBoost" on every predictor in `credit`; predictors are
# centred, scaled and PCA-transformed before fitting.
# NOTE(review): as written, this call ends with the "all the ROC metric values
# are missing" error reproduced in the console output that follows.
logitBoostFit <- train(
  LoanStatus ~ .,
  credit,
  method = "LogitBoost",
  family = binomial,
  preProcess = c("center", "scale", "pca"),
  trControl = ctrl
)


Warning message:
In train.default(x, y, weights = w, ...): The metric "Accuracy" was not in the result set. ROC will be used instead.
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled performance measures.
Something is wrong; all the ROC metric values are missing:
      ROC           Sens              Spec       
 Min.   : NA   Min.   :0.03496   Min.   :0.9747  
 1st Qu.: NA   1st Qu.:0.03919   1st Qu.:0.9758  
 Median : NA   Median :0.04343   Median :0.9770  
 Mean   :NaN   Mean   :0.04349   Mean   :0.9779  
 3rd Qu.: NA   3rd Qu.:0.04776   3rd Qu.:0.9795  
 Max.   : NA   Max.   :0.05210   Max.   :0.9821  
 NA's   :3                                       
Error in train.default(x, y, weights = w, ...): Stopping


# Install pROC from the RStudio CRAN mirror. HTTPS is used instead of plain
# HTTP so the package download is encrypted in transit.
install.packages("pROC", repos = "https://cran.rstudio.com/")
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var


'data.frame':   8580 obs. of  45 variables:
 $ ListingCategory            : int  1 7 3 1 1 7 1 1 1 1 ...
 $ IncomeRange                : int  3 4 6 4 4 3 3 4 3 3 ...
 $ StatedMonthlyIncome        : num  2583 4326 10500 4167 5667 ...
 $ IncomeVerifiable           : logi  TRUE TRUE TRUE FALSE TRUE TRUE ...
 $ DTIwProsperLoan            : num  1.8e-01 2.0e-01 1.7e-01 1.0e+06 1.8e-01 4.4e-01 2.2e-01 2.0e-01 2.0e-01 3.1e-01 ...
 $ EmploymentStatusDescription: Factor w/ 7 levels "Employed","Full-time",..: 1 4 1 7 1 1 1 1 1 1 ...
 $ Occupation                 : Factor w/ 65 levels "","Accountant/CPA",..: 37 37 20 14 43 58 48 37 37 37 ...
 $ MonthsEmployed             : int  4 44 159 67 26 16 209 147 24 9 ...
 $ BorrowerState              : Factor w/ 48 levels "AK","AL","AR",..: 22 32 5 5 14 28 4 10 10 34 ...
 $ BorrowerCity               : Factor w/ 3089 levels "AARONSBURG","ABERDEEN",..: 1737 3059 2488 654 482 719 895 1699 2747 1903 ...
 $ BorrowerMetropolitanArea   : Factor w/ 1 level "(Not Implemented)": 1 1 1 1 1 1 1 1 1 1 ...
 $ LenderIndicator            : int  0 0 0 1 0 0 0 0 1 0 ...
 $ GroupIndicator             : logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
 $ GroupName                  : Factor w/ 83 levels "","00 Used Car Loans",..: 1 1 1 47 1 1 1 1 1 1 ...
 $ ChannelCode                : int  90000 90000 90000 80000 40000 40000 90000 90000 80000 90000 ...
 $ AmountParticipation        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ MonthlyDebt                : int  247 785 1631 817 644 1524 427 817 654 749 ...
 $ CurrentDelinquencies       : int  0 0 0 0 0 0 0 1 0 1 ...
 $ DelinquenciesLast7Years    : int  0 10 0 0 0 0 0 0 0 0 ...
 $ PublicRecordsLast10Years   : int  0 1 0 0 0 0 1 0 1 0 ...
 $ PublicRecordsLast12Months  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ FirstRecordedCreditLine    : Factor w/ 4719 levels "1/1/00 0:00",..: 3032 2673 1197 2541 4698 4345 3150 925 4452 2358 ...
 $ CreditLinesLast7Years      : int  53 30 36 26 7 22 15 20 34 32 ...
 $ InquiriesLast6Months       : int  2 8 5 0 0 0 0 3 0 0 ...
 $ AmountDelinquent           : int  0 0 0 0 0 0 0 63 0 15 ...
 $ CurrentCreditLines         : int  10 10 18 10 4 11 6 10 7 8 ...
 $ OpenCreditLines            : int  9 10 15 8 3 8 5 7 7 8 ...
 $ BankcardUtilization        : num  0.26 0.69 0.94 0.69 0.81 0.38 0.55 0.24 0.03 0 ...
 $ TotalOpenRevolvingAccounts : int  9 7 12 10 3 5 4 5 4 6 ...
 $ InstallmentBalance         : int  48648 14827 0 0 0 30916 0 21619 41340 15447 ...
 $ RealEstateBalance          : int  0 0 577745 0 0 0 191296 0 0 126039 ...
 $ RevolvingBalance           : int  5265 9967 94966 50511 37871 22463 19550 2436 1223 3236 ...
 $ RealEstatePayment          : int  0 0 4159 0 0 0 1303 0 0 1279 ...
 $ RevolvingAvailablePercent  : int  78 52 36 45 18 61 44 74 96 76 ...
 $ TotalInquiries             : int  8 11 15 2 0 0 1 7 1 1 ...
 $ TotalTradeItems            : int  53 30 36 26 7 22 15 20 34 32 ...
 $ SatisfactoryAccounts       : int  52 23 36 26 7 19 15 18 34 29 ...
 $ NowDelinquentDerog         : int  0 0 0 0 0 0 0 1 0 1 ...
 $ WasDelinquentDerog         : int  1 7 0 0 0 3 0 1 0 2 ...
 $ OldestTradeOpenDate        : int  5092001 5011977 12011984 4272000 9081993 9122000 6161987 11181999 9191990 4132000 ...
 $ DelinquenciesOver30Days    : int  0 6 0 0 0 13 0 2 0 2 ...
 $ DelinquenciesOver60Days    : int  0 4 0 0 0 0 0 0 0 1 ...
 $ DelinquenciesOver90Days    : int  0 10 0 0 0 0 0 0 0 0 ...
 $ IsHomeowner                : logi  FALSE FALSE TRUE FALSE FALSE FALSE ...
 $ LoanStatus                 : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 2 1 ...

# dput()-style literal of a four-row data.frame (row.names = c(NA, 4L)).
# It carries a subset of the columns listed in the str() output above; the
# outcome column LoanStatus is a two-level factor with labels "0" and "1".
# The expression is not assigned to a name here; evaluating it returns the
# data.frame.
structure(list(ListingCategory = c(1L, 7L, 3L, 1L), IncomeRange = c(3L, 
4L, 6L, 4L), StatedMonthlyIncome = c(2583.3333, 4326, 10500, 
4166.6667), IncomeVerifiable = c(TRUE, TRUE, TRUE, FALSE), DTIwProsperLoan = c(0.18, 
0.2, 0.17, 1e+06), EmploymentStatusDescription = structure(c(1L, 
4L, 1L, 7L), .Label = c("Employed", "Full-time", "Not employed", 
"Other", "Part-time", "Retired", "Self-employed"), class = "factor"), 
    MonthsEmployed = c(4L, 44L, 159L, 67L), BorrowerState = structure(c(22L, 
    32L, 5L, 5L), .Label = c("AK", "AL", "AR", "AZ", "CA", "CO", 
    "CT", "DC", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "KS", 
    "KY", "LA", "MA", "MD", "MI", "MN", "MO", "MS", "MT", "NC", 
    "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", 
    "WV", "WY"), class = "factor"), LenderIndicator = c(0L, 0L, 
    0L, 1L), GroupIndicator = c(FALSE, FALSE, FALSE, TRUE), ChannelCode = c(90000L, 
    90000L, 90000L, 80000L), MonthlyDebt = c(247L, 785L, 1631L, 
    817L), CurrentDelinquencies = c(0L, 0L, 0L, 0L), DelinquenciesLast7Years = c(0L, 
    10L, 0L, 0L), PublicRecordsLast10Years = c(0L, 1L, 0L, 0L
    ), PublicRecordsLast12Months = c(0L, 0L, 0L, 0L), CreditLinesLast7Years = c(53L, 
    30L, 36L, 26L), InquiriesLast6Months = c(2L, 8L, 5L, 0L), 
    AmountDelinquent = c(0L, 0L, 0L, 0L), CurrentCreditLines = c(10L, 
    10L, 18L, 10L), OpenCreditLines = c(9L, 10L, 15L, 8L), BankcardUtilization = c(0.26, 
    0.69, 0.94, 0.69), TotalOpenRevolvingAccounts = c(9L, 7L, 
    12L, 10L), InstallmentBalance = c(48648L, 14827L, 0L, 0L), 
    RealEstateBalance = c(0L, 0L, 577745L, 0L), RevolvingBalance = c(5265L, 
    9967L, 94966L, 50511L), RealEstatePayment = c(0L, 0L, 4159L, 
    0L), RevolvingAvailablePercent = c(78L, 52L, 36L, 45L), TotalInquiries = c(8L, 
    11L, 15L, 2L), TotalTradeItems = c(53L, 30L, 36L, 26L), SatisfactoryAccounts = c(52L, 
    23L, 36L, 26L), NowDelinquentDerog = c(0L, 0L, 0L, 0L), WasDelinquentDerog = c(1L, 
    7L, 0L, 0L), OldestTradeOpenDate = c(5092001L, 5011977L, 
    12011984L, 4272000L), DelinquenciesOver30Days = c(0L, 6L, 
    0L, 0L), DelinquenciesOver60Days = c(0L, 4L, 0L, 0L), DelinquenciesOver90Days = c(0L, 
    10L, 0L, 0L), IsHomeowner = c(FALSE, FALSE, TRUE, FALSE), 
    LoanStatus = structure(c(2L, 1L, 1L, 2L), .Label = c("0", 
    "1"), class = "factor")), .Names = c("ListingCategory", "IncomeRange", 
"StatedMonthlyIncome", "IncomeVerifiable", "DTIwProsperLoan", 
"EmploymentStatusDescription", "MonthsEmployed", "BorrowerState", 
"LenderIndicator", "GroupIndicator", "ChannelCode", "MonthlyDebt", 
"CurrentDelinquencies", "DelinquenciesLast7Years", "PublicRecordsLast10Years", 
"PublicRecordsLast12Months", "CreditLinesLast7Years", "InquiriesLast6Months", 
"AmountDelinquent", "CurrentCreditLines", "OpenCreditLines", 
"BankcardUtilization", "TotalOpenRevolvingAccounts", "InstallmentBalance", 
"RealEstateBalance", "RevolvingBalance", "RealEstatePayment", 
"RevolvingAvailablePercent", "TotalInquiries", "TotalTradeItems", 
"SatisfactoryAccounts", "NowDelinquentDerog", "WasDelinquentDerog", 
"OldestTradeOpenDate", "DelinquenciesOver30Days", "DelinquenciesOver60Days", 
"DelinquenciesOver90Days", "IsHomeowner", "LoanStatus"), row.names = c(NA, 
4L), class = "data.frame")


Warning message:
In train.default(x, y, weights = w, ...): The metric "Accuracy" was not in the result set. ROC will be used instead.
# weights:  72 (71 variable)
initial  value 5144.538374 
iter  10 value 3540.667624
iter  20 value 3329.692768
iter  30 value 3279.191024
iter  40 value 3264.926986
iter  50 value 3259.276647
iter  60 value 3259.056261
final  value 3259.032668 
# weights:  72 (71 variable)
initial  value 5144.538374 
iter  10 value 3540.774666
iter  20 value 3330.016829
iter  30 value 3279.545595
iter  40 value 3265.384385
iter  50 value 3259.499032
iter  60 value 3259.353010
final  value 3259.342601 
# weights:  72 (71 variable)
initial  value 5144.538374 
iter  10 value 3540.667731
iter  20 value 3329.693092
iter  30 value 3279.191379
iter  40 value 3264.927427
iter  50 value 3259.276899
iter  60 value 3259.056561
final  value 3259.032978 
# weights:  72 (71 variable)
initial  value 5144.538374 
iter  10 value 3528.401458
iter  20 value 3314.932958
iter  30 value 3264.117072
iter  40 value 3253.780051
iter  50 value 3253.368959
iter  60 value 3253.359047
final  value 3253.358819 
# weights:  72 (71 variable)
initial  value 5144.538374 
iter  10 value 3528.508505
iter  20 value 3315.134599
iter  30 value 3265.021404
iter  40 value 3255.739021
iter  50 value 3253.817833
iter  60 value 3253.697180
final  value 3253.671003 
# weights:  72 (71 variable)
initial  value 5144.538374 
iter  10 value 3528.401565
iter  20 value 3314.933160
iter  30 value 3264.117768
iter  40 value 3253.780539
iter  50 value 3253.369030
iter  60 value 3253.359358
final  value 3253.359133 
# weights:  71 (70 variable)
initial  value 5145.231521 
iter  10 value 4680.326236
iter  20 value 4672.506024
iter  30 value 3662.998233
iter  40 value 3310.207744
iter  50 value 3252.983656
iter  60 value 3250.400275
iter  70 value 3250.339216
final  value 3250.332646 

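# weights:  72 (71 variable)
initial  value 5144.538374 
iter  10 value 4661.569290
iter  20 value 4652.246624
iter  30 value 3715.472355
iter  40 value 3484.096833
iter  50 value 3254.247424
iter  60 value 3248.931841
iter  70 value 3248.154679
iter  80 value 3248.129089
[garbled in source: "iter 80 value 3248" fused with "4663.660886"; the start of the next trace is lost]
iter  20 value 4654.255466
iter  30 value 3542.473235
iter  40 value 3315.027437
iter  50 value 3250.340679
iter  60 value 3248.693378
iter  70 value 3248.455840
iter  80 value 3248.443345
iter  80 value 3248.443325
iter  80 value 3248.445325
final  value 3248 [garbled in source: remainder of this value and the start of the next trace are lost]
iter  30 value 4397.069608
iter  40 value 3532.067046
iter  50 value 3283.179445
iter  60 value 3249.518694
iter  70 value 3248.163057
iter  80 value 3248.129552
final  value 3248.128889 
converged
Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled performance measures.
Something is wrong; all the ROC metric values are missing:
      ROC           Sens              Spec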


# Second attempt: plain cross-validation, still summarised by twoClassSummary.
ctrl <- trainControl(
  method = "cv",
  summaryFunction = twoClassSummary
)

# Fit method "multinom" on every predictor in `credit`.
# NOTE(review): this configuration ends with the "all the ROC metric values
# are missing" error reproduced in the console output that follows.
multinomSummaryFit <- train(
  LoanStatus ~ .,
  credit,
  method = "multinom",
  family = binomial,
  trControl = ctrl
)

Warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled performance measures.
Something is wrong; all the ROC metric values are missing:
      ROC           Sens              Spec       
 Min.   : NA   Min.   :0.01919   Min.   :0.9941  
 1st Qu.: NA   1st Qu.:0.01988   1st Qu.:0.9942  
 Median : NA   Median :0.02056   Median :0.9943  
 Mean   :NaN   Mean   :0.02011   Mean   :0.9943  
 3rd Qu.: NA   3rd Qu.:0.02056   3rd Qu.:0.9943  
 Max.   : NA   Max.   :0.02057   Max.   :0.9944  
 NA's   :3                                       
Error in train.default(x, y, weights = w, ...): Stopping









变量 <code>MonthsEmployed</code> 有 5 个 <code>NA</code> 值:

Min.   :-23.00  
1st Qu.: 26.00  
Median : 68.00 
Mean   : 97.44  
3rd Qu.:139.00  
Max.   :755.00  
NA's   :5  

变量 <code>InstallmentBalance</code> 有 328 个 <code>NA</code> 值:

Min.   :     0  
1st Qu.:  3338       
Median : 14453       
Mean   : 24900       
3rd Qu.: 32238      
Max.   :739371    
NA's   :328     



# Repeated cross-validation with 3 repeats. classProbs = TRUE asks caret to
# compute class probabilities, and twoClassSummary supplies the ROC / Sens /
# Spec columns seen in the console output.
# (A stray "." after the closing parenthesis made the original statement a
# parse error; it has been removed.)
# NOTE(review): with classProbs = TRUE caret expects the outcome's factor
# levels to be valid R names, so levels such as "0"/"1" presumably need to be
# relabelled first (the confusionMatrix() call later uses "Default") - confirm.
ctrl <- trainControl(
  method = "repeatedcv",
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)


# Fit method "multinom" on every predictor in `credit`, selecting the model
# by the "ROC" metric instead of the default "Accuracy".
multinomSummaryFit <- train(
  LoanStatus ~ .,
  data = credit,
  method = "multinom",
  metric = "ROC",
  trControl = ctrl
)

关于数据集的另一个重要问题是,您需要仔细检查各变量的取值,并确保每个值都有意义。例如,<code>MonthsEmployed</code> 变量含有负值。从逻辑上讲,就业月数不应为负数。这些负值是录入错误,还是另有含义?(例如,值 -23 可能表示该人已有 23 个月未受雇。)

回答您关于混淆矩阵(<code>confusionMatrix</code>)的问题:


# Assume the held-out data frame is called `test`.
# Predict from every column except the outcome, LoanStatus.
mymodel_pred <- predict(
  multinomSummaryFit,
  newdata = test[, names(test) != "LoanStatus"]
)


# Compare the predictions with the observed outcome, treating "Default" as
# the positive class.
confusionMatrix(
  data = mymodel_pred,
  reference = test$LoanStatus,
  positive = "Default"
)

如果测试数据集没有 LoanStatus 列,则只需使用:

# No outcome column to drop here: hand the test data frame to predict() as is.
mymodel_pred <- predict(multinomSummaryFit, newdata = test)


请记住,如果您从训练数据集中移除了任何变量,那么在调用 <code>predict</code> 之前,也需要从测试数据集中移除这些变量。


# Draw the row indices of a 70% training partition based on LoanStatus;
# list = FALSE requests the indices in non-list form.
trainingRows <- createDataPartition(
  y = credit$LoanStatus,
  p = 0.70,
  list = FALSE
)

# The selected rows form the training set; all remaining rows form the test set.
train <- credit[trainingRows, ]
test <- credit[-trainingRows, ]
