cross_val_score 无法配合 roc_auc 用于多分类问题

我想在一个多分类问题上，以 roc_auc 作为评分指标来计算 cross_val_score。

下面是使用鸢尾花（iris）数据集制作的可复现示例。

from sklearn.datasets import load_iris

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_val_score

iris = load_iris()

X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

我对目标变量进行独热编码（one-hot encoding）：

encoder = OneHotEncoder()

y = encoder.fit_transform(pd.DataFrame(iris.target)).toarray()

我使用决策树分类器

model = DecisionTreeClassifier(max_depth=1)

最后我执行交叉验证：

cross_val_score(model, X, y, cv=3, scoring="roc_auc")

最后一行抛出以下错误

---------------------------------------------------------------------------

ValueError Traceback (most recent call last)

<ipython-input-87-91dc6fa67512> in <module>()

----> 1 cross_val_score(model, X, y, cv=3, scoring="roc_auc")

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)

340 n_jobs=n_jobs, verbose=verbose,

341 fit_params=fit_params,

--> 342 pre_dispatch=pre_dispatch)

343 return cv_results['test_score']

344

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)

204 fit_params, return_train_score=return_train_score,

205 return_times=True)

--> 206 for train, test in cv.split(X, y, groups))

207

208 if return_train_score:

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)

777 # was dispatched. In particular this covers the edge

778 # case of Parallel used with an exhausted iterator.

--> 779 while self.dispatch_one_batch(iterator):

780 self._iterating = True

781 else:

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)

623 return False

624 else:

--> 625 self._dispatch(tasks)

626 return True

627

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)

586 dispatch_timestamp = time.time()

587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)

--> 588 job = self._backend.apply_async(batch, callback=cb)

589 self._jobs.append(job)

590

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)

109 def apply_async(self, func, callback=None):

110 """Schedule a func to be run"""

--> 111 result = ImmediateResult(func)

112 if callback:

113 callback(result)

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)

330 # Don't delay the application, to avoid keeping the input

331 # arguments in memory

--> 332 self.results = batch()

333

334 def get(self):

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)

129

130 def __call__(self):

--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]

132

133 def __len__(self):

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)

129

130 def __call__(self):

--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]

132

133 def __len__(self):

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)

486 fit_time = time.time() - start_time

487 # _score will return dict if is_multimetric is True

--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)

489 score_time = time.time() - start_time - fit_time

490 if return_train_score:

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)

521 """

522 if is_multimetric:

--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)

524 else:

525 if y_test is None:

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)

551 score = scorer(estimator, X_test)

552 else:

--> 553 score = scorer(estimator, X_test, y_test)

554

555 if hasattr(score, 'item'):

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)

204 **self._kwargs)

205 else:

--> 206 return self._sign * self._score_func(y, y_pred, **self._kwargs)

207

208 def _factory_args(self):

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/metrics/ranking.py in roc_auc_score(y_true, y_score, average, sample_weight)

275 return _average_binary_score(

276 _binary_roc_auc_score, y_true, y_score, average,

--> 277 sample_weight=sample_weight)

278

279

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/metrics/base.py in _average_binary_score(binary_metric, y_true, y_score, average, sample_weight)

116 y_score_c = y_score.take([c], axis=not_average_axis).ravel()

117 score[c] = binary_metric(y_true_c, y_score_c,

--> 118 sample_weight=score_weight)

119

120 # Average the results

~/programs/anaconda3/lib/python3.7/site-packages/sklearn/metrics/ranking.py in _binary_roc_auc_score(y_true, y_score, sample_weight)

266 def _binary_roc_auc_score(y_true, y_score, sample_weight=None):

267 if len(np.unique(y_true)) != 2:

--> 268 raise ValueError("Only one class present in y_true. ROC AUC score "

269 "is not defined in that case.")

270

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

python == 3.7.2

sklearn == 0.19.2

这是一个 bug，还是我的用法有误？

回答:

scikit-learn 的交叉验证功能有一个本可避免的麻烦：默认情况下不会对数据进行打乱（shuffle）；把打乱设为默认选项或许会更好——

当然，这首先需要 cross_val_score 提供用于打乱的参数，但遗憾的是它并没有提供（参见文档）。

所以，实际发生的情况如下：鸢尾花数据集的150个样本是按类别依次排列的（前50个为类别0，中间50个为类别1，最后50个为类别2）：

iris.target[0:50]

# result

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0])

iris.target[50:100]

# result:

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1])

iris.target[100:150]

# result:

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

2, 2, 2, 2, 2, 2])

现在，对如上所示按类别顺序排列的150个样本做3折交叉验证（由于 y 经过独热编码，不再是普通的多分类标签，cross_val_score 使用的是不打乱的 KFold，而不是 StratifiedKFold），再结合下面这条错误信息：

ValueError: Only one class present in y_true

原因应该就不难理解了：3个验证折中的每一个都只包含一种标签，因此无法计算 ROC（更不用说，每个验证折中出现的标签，模型在对应的训练折里根本没有见过）。

因此，只需先把数据打乱即可：

from sklearn.utils import shuffle

X_s, y_s = shuffle(X, y)

cross_val_score(model, X_s, y_s, cv=3, scoring="roc_auc")

这样应该就没问题了。

以上是 Cross_val_score不适用于roc_auc和多类 的全部内容, 来源链接: utcz.com/qa/401323.html

回到顶部