代码之家  ›  专栏  ›  技术社区  ›  Lukas Heeren

xgboost-poisson回归:标签必须为非负

  •  3
  • Lukas Heeren  · 技术社区  · 8 年前

    我使用的是windows 10笔记本电脑,R和xgboost版本为0.6-4。运行以下代码时,我遇到了一个奇怪的错误。

    # Poisson regression objective: xgboost requires every label to be a
    # nonnegative (count-like) value; "rmse" here only controls reporting.
    xgb_params <- list("objective" = "count:poisson",
                    "eval_metric" = "rmse")
    # This call fails with "label must be nonnegative" because
    # y_training_fold does not satisfy the label check (see error below).
     regression <- xgboost(data = training_fold, 
                       label = y_training_fold, 
                       nrounds = 10,
                       params = xgb_params)
    
    Error in xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) :
    amalgamation/../src/objective/regression_obj.cc:190: Check failed: 
    label_correct PoissonRegression: label must be nonnegative
    

    Min.   1st Qu. Median  Mean   3rd Qu. Max.   NA's
    0.1129 0.3387  0.7000  1.0987 1.5265  4.5405 287
    

    我怎样才能解决这个问题?我试图删除NA的,但没有帮助。

    编辑

    以下是traindata的示例

    dput(droplevels(head(train[, c(1,2,4,5,6,8,9,10,11)], 20)))
    
    structure(list(VacancyId = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
    3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("55288","56838", "57822", "57902", "57925", "58008"), class = "factor"), 
    VacancyBankId = c(2L, 1609L, 1611L, 147L, 17L, 1611L, 2L, 
    257L, 1611L, 2L, 147L, 17L, 1611L, 239L, 1609L, 2L, 1609L, 
    2L, 2L, 1609L), FunctionId = c(36L, 36L, 36L, 36L, 35L, 35L, 
    3L, 4L, 4L, 4L, 4L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L), 
    EducationLevel = c(6L, 6L, 6L, 6L, 6L, 6L, 4L, 6L, 6L, 6L, 
    6L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L), ProvinceId = c(22L, 
    22L, 22L, 22L, 24L, 24L, 19L, 16L, 16L, 16L, 16L, 19L, 19L, 
    19L, 21L, 21L, 16L, 16L, 22L, 22L), CandidatesCount = c(126L, 
    27L, 18L, 12L, 1L, 4L, 2L, 6L, 7L, 7L, 1L, 8L, 15L, 13L, 
    7L, 7L, 7L, 7L, 7L, 7L), DurationDays = c(62L, 62L, 62L, 
    62L, 18L, 18L, 43L, 61L, 61L, 61L, 61L, 60L, 60L, 60L, 62L, 
    62L, 62L, 62L, 62L, 62L), DurationWeeks = c(8.857142857, 
    8.857142857, 8.857142857, 8.857142857, 2.571428571, 2.571428571, 
    6.142857143, 8.714285714, 8.714285714, 8.714285714, 8.714285714, 
    8.571428571, 8.571428571, 8.571428571, 8.857142857, 8.857142857, 
    8.857142857, 8.857142857, 8.857142857, 8.857142857), CandidatesPerWeek = c(NA, 
    3.048387097, 2.032258065, 1.35483871, 0.388888889, 1.555555556, 
    0.325581395, 0.68852459, 0.803278689, 0.803278689, 0.114754098, 
    0.933333333, 1.75, 1.516666667, 0.790322581, 0.790322581, 
    0.790322581, 0.790322581, 0.790322581, 0.790322581)), .Names = c("VacancyId", "VacancyBankId", "FunctionId", "EducationLevel", "ProvinceId", "CandidatesCount", "DurationDays", "DurationWeeks", "CandidatesPerWeek"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 26L, 27L, 28L, 29L, 30L, 31L), class = "data.frame")
    

    希望有人能帮我!

    1 回复  |  直到 6 年前
        1
  •  1
  •   Marco Sandri    8 年前

    问题不在于 y_training_fold 中存在负值,而在于其中存在非整数值。

    library(xgboost)
    
    # Deliberate reproduction of the error: rnorm() draws from a normal
    # distribution, so these labels contain negative and non-integer values,
    # which the count:poisson objective rejects.
    training_fold <- matrix(rnorm(1000),nrow=100)
    y_training_fold <- matrix(rnorm(100),ncol=1)
    
    xgb_params <- list("objective" = "count:poisson",
                    "eval_metric" = "rmse")
    # Expected to fail with "label must be nonnegative" (same message as
    # in the question).
     regression <- xgboost(data = training_fold, 
                       label = y_training_fold, 
                       nrounds = 10,
                       params = xgb_params)
    

    错误消息与您报告的完全相同:

    Error in xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) : 
      [11:46:28] amalgamation/../src/objective/regression_obj.cc:190: 
      Check failed: label_correct PoissonRegression: label must be nonnegative
    

    现在,改用非负整数标签 y_training_fold 再试试:

    # rpois() generates nonnegative integer counts, so these labels satisfy
    # the count:poisson label check and training succeeds.
    y_training_fold <- matrix(rpois(100,10),ncol=1)
    
    xgb_params <- list("objective" = "count:poisson",
                    "eval_metric" = "rmse")
    regression <- xgboost(data = training_fold, 
                       label = y_training_fold, 
                       nrounds = 10,
                       params = xgb_params)
    

    xgboost 效果很好:

    [1]     train-rmse:9.795855 
    [2]     train-rmse:9.660112 
    [3]     train-rmse:9.492991 
    [4]     train-rmse:9.287366 
    [5]     train-rmse:9.034582 
    [6]     train-rmse:8.724205 
    [7]     train-rmse:8.343800 
    [8]     train-rmse:7.878869 
    [9]     train-rmse:7.312294 
    [10]    train-rmse:6.632671
    

    编辑。

    # Reconstructed sample data from the question (dput output): 20 rows of
    # vacancy records; CandidatesPerWeek (column 9) is the response variable.
    dts <- structure(list(VacancyId = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
    3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("55288","56838", "57822", "57902", "57925", "58008"), class = "factor"), 
    VacancyBankId = c(2L, 1609L, 1611L, 147L, 17L, 1611L, 2L, 
    257L, 1611L, 2L, 147L, 17L, 1611L, 239L, 1609L, 2L, 1609L, 
    2L, 2L, 1609L), FunctionId = c(36L, 36L, 36L, 36L, 35L, 35L, 
    3L, 4L, 4L, 4L, 4L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L), 
    EducationLevel = c(6L, 6L, 6L, 6L, 6L, 6L, 4L, 6L, 6L, 6L, 
    6L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L), ProvinceId = c(22L, 
    22L, 22L, 22L, 24L, 24L, 19L, 16L, 16L, 16L, 16L, 19L, 19L, 
    19L, 21L, 21L, 16L, 16L, 22L, 22L), CandidatesCount = c(126L, 
    27L, 18L, 12L, 1L, 4L, 2L, 6L, 7L, 7L, 1L, 8L, 15L, 13L, 
    7L, 7L, 7L, 7L, 7L, 7L), DurationDays = c(62L, 62L, 62L, 
    62L, 18L, 18L, 43L, 61L, 61L, 61L, 61L, 60L, 60L, 60L, 62L, 
    62L, 62L, 62L, 62L, 62L), DurationWeeks = c(8.857142857, 
    8.857142857, 8.857142857, 8.857142857, 2.571428571, 2.571428571, 
    6.142857143, 8.714285714, 8.714285714, 8.714285714, 8.714285714, 
    8.571428571, 8.571428571, 8.571428571, 8.857142857, 8.857142857, 
    8.857142857, 8.857142857, 8.857142857, 8.857142857), CandidatesPerWeek = c(NA, 
    3.048387097, 2.032258065, 1.35483871, 0.388888889, 1.555555556, 
    0.325581395, 0.68852459, 0.803278689, 0.803278689, 0.114754098, 
    0.933333333, 1.75, 1.516666667, 0.790322581, 0.790322581, 
    0.790322581, 0.790322581, 0.790322581, 0.790322581)), 
    .Names = c("VacancyId", "VacancyBankId", "FunctionId", "EducationLevel", 
    "ProvinceId", "CandidatesCount", "DurationDays", "DurationWeeks", 
    "CandidatesPerWeek"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 11L, 12L, 
    13L, 14L, 15L, 16L, 17L, 18L, 26L, 27L, 28L, 29L, 30L, 31L), 
    class = "data.frame")
    
    # Delete missing values
    # (this also drops rows whose label is NA — NA labels would fail the
    # nonnegative-label check just like negative ones)
    dts <- na.omit(dts)
    
    # Build X matrix of potential predictors
    # Important: do not use the first column (ID) and the last (response variable)
    training_fold <- as.matrix(dts[,-c(1,9)])
    # Round to the nearest integer the response variable
    # NOTE(review): count:poisson in xgboost 0.6-x appears to reject
    # non-integer labels, hence the rounding — confirm against the version used.
    y_training_fold <- as.matrix(dts[,9])
    y_training_fold <- round(y_training_fold)
    
    xgb_params <- list("objective" = "count:poisson",
                    "eval_metric" = "rmse")
    # Wrapping in ( ) prints the fitted booster (output shown below).
    ( regression <- xgboost(data = training_fold, 
                       label = y_training_fold, 
                       nrounds = 10,
                       params = xgb_params) )
    # Output
    ##### xgb.Booster
    # raw: 4.6 Kb 
    # call:
    #   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
    #     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
    #     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
    #     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
    #     callbacks = callbacks)
    # params (as set within xgb.train):
    #   objective = "count:poisson", eval_metric = "rmse", silent = "1"
    # xgb.attributes:
    #   niter
    # callbacks:
    #   cb.print.evaluation(period = print_every_n)
    #   cb.evaluation.log()
    #   cb.save.model(save_period = save_period, save_name = save_name)
    # niter: 10
    # evaluation_log:
    #     iter train_rmse
    #        1   0.914084
    #        2   0.829741
    # ---                
    #        9   0.332951
    #       10   0.291877