
Difference between xgboost.cv and sklearn.cross_val_score

  • gabboshow · asked 6 years ago

    Note: I have already looked at and tried what is described in this question, but it does not solve the problem.

    Import the libraries and define the constants

    from sklearn import datasets
    from sklearn import model_selection
    import pandas
    import xgboost
    
    # define random state and seeds
    RANDOM_STATE = 123
    SEEDS = 123
    
    # definition parameters (present XGBRegressor)
    base_score = 0.5
    booster = "gbtree"
    colsample_bylevel = 1
    colsample_bytree = 1
    gamma = 0
    learning_rate = 0.1
    max_delta_step = 0
    max_depth = 5
    min_child_weight = 2
    missing = None
    nestimator = 50
    n_jobs=1
    njobs = 1
    nthread = -1
    objective = "reg:linear"
    random_state = RANDOM_STATE
    reg_alpha = 0
    reg_lambda = 1
    scale_pos_weight = 1
    seed = SEEDS
    silent = True
    subsample = 0.9
    
    # parameters xgboost
    disable_default_eval_metric = 1
    colsample_bynode = 1
    tree_method = "auto"
    
    # for early stopping (set to None here for simplicity; if not None, eval_set and eval_metric must also be passed)
    early_stopping_rounds = None
    eval_set = None
    eval_metric = "mae"
    

    Load the Boston dataset

    # load boston dataset
    dataset_boston = datasets.load_boston()
    
    # extract X, y and transform in DMatrix format
    X = pandas.DataFrame(data=dataset_boston.data, 
                         columns=dataset_boston.feature_names)
    y = pandas.Series(dataset_boston.target, name="PRICE")
    data_dmat = xgboost.DMatrix(data=X, label=y)
    

    # define 3 folds to be used both in xgboost.cv and sklearn.cross_val_score
    folds = model_selection.KFold(n_splits=3, 
                                          shuffle=True, 
                                          random_state=RANDOM_STATE)
    

    With sklearn.cross_val_score

    # put parameters in dictionary
    param_sklearn = {
        "base_score": base_score,
        "booster": booster,
        "colsample_bylevel": colsample_bylevel,
        "colsample_bytree": colsample_bytree,
        "gamma": gamma,
        "learning_rate": learning_rate, # NOTE: in sklearn is called learning_rate not "eta"
        "max_delta_step": max_delta_step,
        "max_depth": max_depth,
        "min_child_weight": min_child_weight,
        "missing": missing,
        "n_estimators": nestimator, # NOTE: in sklearn are called n_estimators
        "n_jobs":n_jobs,
        "njobs":njobs,
        "nthread":nthread,
        "objective": objective,
        "random_state": RANDOM_STATE,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
        "scale_pos_weight": scale_pos_weight,
        "seed": SEEDS,
        "silent": silent,
        "subsample": subsample,
        # only xgboost
        "disable_default_eval_metric": disable_default_eval_metric,
        "eval_metric": eval_metric,
        "colsample_bynode": colsample_bynode,
        "tree_method": tree_method,
    }
    # initialize estimator
    estimator = xgboost.sklearn.XGBRegressor(**param_sklearn)
    
    # parameters to pass to the fit method
    fit_params = {
        "sample_weight": None,
        "eval_set": eval_set,
        "eval_metric": eval_metric,
        "early_stopping_rounds": early_stopping_rounds,
        "verbose": False,
        "xgb_model": None,
        "sample_weight_eval_set": None,
    }
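    
    # For illustration only (not used in this post): if early_stopping_rounds
    # were not None, fit would also need a held-out validation set via eval_set.
    # A minimal, hypothetical sketch (X_tr/X_val/y_tr/y_val are new names):
    # X_tr, X_val, y_tr, y_val = model_selection.train_test_split(
    #     X, y, test_size=0.2, random_state=RANDOM_STATE)
    # fit_params_with_es = dict(fit_params,
    #                           eval_set=[(X_val, y_val)],
    #                           early_stopping_rounds=10)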
    
    # we need to use a negative score because cross_val_score maximizes the score
    scoring = "neg_mean_absolute_error"
    
    results_cross_val_score = model_selection.cross_val_score(
        estimator, X=X, y=y, scoring=scoring, fit_params=fit_params, cv=folds
    )
    
    print(results_cross_val_score.mean())
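    
    As a sanity check on the sklearn side, the same number can be reproduced, up to cross_val_score's sign convention, by looping over the folds explicitly. A minimal sketch (assuming the same xgboost version as above, whose fit method accepts the keys in fit_params):
    
    from sklearn import metrics
    
    # manual re-implementation of the cross-validation performed by cross_val_score
    fold_maes = []
    for train_idx, test_idx in folds.split(X):
        model = xgboost.sklearn.XGBRegressor(**param_sklearn)
        model.fit(X.iloc[train_idx], y.iloc[train_idx], **fit_params)
        preds = model.predict(X.iloc[test_idx])
        fold_maes.append(metrics.mean_absolute_error(y.iloc[test_idx], preds))
    
    # mean of the per-fold MAEs; should equal -results_cross_val_score.mean()
    print(sum(fold_maes) / len(fold_maes))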
    

    With xgboost.cv

    # xgboost parameters
    params = {
        "base_score": base_score,
        "booster": booster,
        "colsample_bylevel": colsample_bylevel,
        "colsample_bytree": colsample_bytree,
        "gamma": gamma,
        "eta": learning_rate, # NOTE: in python implementation is called eta not "learning_rate"
        "max_delta_step": max_delta_step,
        "max_depth": max_depth,
        "min_child_weight": min_child_weight,
        "missing": missing,
        "n_estimators": nestimator,  # NOT? # NOTE: probably not used since we pass nrounds after
        "n_jobs":n_jobs, # NOT?
        "njobs":njobs, # NOT?
        "nthread":nthread, # NOT?
        "objective": objective, # NOT?
        "random_state": RANDOM_STATE, # NOT?
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
        "scale_pos_weight": scale_pos_weight,
        "seed": SEEDS, # NOT?
        "silent": silent,
        "subsample": subsample,
        # only Xgboost
        "disable_default_eval_metric": disable_default_eval_metric,
        "eval_metric": eval_metric,
        "colsample_bynode": colsample_bynode,
        "tree_method": tree_method,
    }
    
    # xgboost cv
    cv_result = xgboost.cv(
        params=params,
        dtrain=data_dmat,
        num_boost_round=nestimator, # NOTE: number of estimator is set here
        folds=folds,
        # nfold=3, # NOTE: should be overwritten by folds
        # stratified=False, # NOTE: should be overwritten by folds
        metrics=eval_metric,
        obj=None,
        feval=None,
        maximize=False,
        early_stopping_rounds=early_stopping_rounds, # NOTE: set to None to simplify debug
        fpreproc=None,
        as_pandas=True,
        verbose_eval=False,
        show_stdv=False,
        seed=SEEDS,
        callbacks=None,
        # shuffle=True, # NOTE: should be overwritten by folds
    )
    
    results_xgboost_cv = cv_result["test-" + eval_metric + "-mean"].iloc[-1]
    print(results_xgboost_cv)
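    
    For reference: with as_pandas=True, xgboost.cv returns a DataFrame with one row per boosting round; for this metric the columns are typically train-mae-mean, train-mae-std, test-mae-mean and test-mae-std, so the line above simply reads the test MAE after the final (50th) round. A quick way to inspect it:
    
    # inspect the cross-validation history returned by xgboost.cv
    print(cv_result.columns.tolist())
    print(cv_result.tail())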
    

    Running the code above, I get the following results:

    -2.2714486795870408
    2.2897483333333333
    

    Any idea why they are different (apart from their sign)?

    ANSWER: I think I understand what is going on. The problem is setting subsample=0.9: XGBoost randomly samples 90% of the training data before growing each tree, to prevent overfitting, and I see no way to initialize that random sampling with the same random state in both implementations. Setting subsample=1 (i.e. no subsampling at all), the results are (a sketch of this re-run follows the numbers below):

    2.3969616666666664
    2.396961763339098
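    
    A minimal sketch of that check, re-using the objects defined above:
    
    # re-run both evaluations with row subsampling disabled (subsample=1)
    param_sklearn["subsample"] = 1
    params["subsample"] = 1
    
    estimator_nosub = xgboost.sklearn.XGBRegressor(**param_sklearn)
    scores_nosub = model_selection.cross_val_score(
        estimator_nosub, X=X, y=y, scoring="neg_mean_absolute_error",
        fit_params=fit_params, cv=folds
    )
    
    cv_nosub = xgboost.cv(
        params=params, dtrain=data_dmat, num_boost_round=nestimator,
        folds=folds, metrics=eval_metric, as_pandas=True,
        verbose_eval=False, seed=SEEDS,
    )
    
    print(-scores_nosub.mean())                  # sklearn side, sign flipped back
    print(cv_nosub["test-mae-mean"].iloc[-1])    # xgboost.cv side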
    