import numpy as np
from scipy import optimize
from numpy.testing import assert_allclose
from scipy.special import factorial, xlogy
from itertools import product
import pytest

from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import max_error
from sklearn.metrics import mean_pinball_loss
from sklearn.metrics import r2_score
from sklearn.metrics import mean_tweedie_deviance
from sklearn.metrics import d2_tweedie_score
from sklearn.metrics import make_scorer

from sklearn.metrics._regression import _check_reg_targets

from sklearn.exceptions import UndefinedMetricWarning


def test_regression_metrics(n_samples=50):
    y_true = np.arange(n_samples)
    y_pred = y_true + 1
    y_pred_2 = y_true - 1

    assert_almost_equal(mean_squared_error(y_true, y_pred), 1.0)
    assert_almost_equal(
        mean_squared_log_error(y_true, y_pred),
        mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred)),
    )
    assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.0)
    assert_almost_equal(mean_pinball_loss(y_true, y_pred), 0.5)
    assert_almost_equal(mean_pinball_loss(y_true, y_pred_2), 0.5)
    assert_almost_equal(mean_pinball_loss(y_true, y_pred, alpha=0.4), 0.6)
    assert_almost_equal(mean_pinball_loss(y_true, y_pred_2, alpha=0.4), 0.4)
    assert_almost_equal(median_absolute_error(y_true, y_pred), 1.0)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    assert np.isfinite(mape)
    assert mape > 1e6
    assert_almost_equal(max_error(y_true, y_pred), 1.0)
    assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2)
    assert_almost_equal(explained_variance_score(y_true, y_pred), 1.0)
    assert_almost_equal(
        mean_tweedie_deviance(y_true, y_pred, power=0),
        mean_squared_error(y_true, y_pred),
    )
    assert_almost_equal(
        d2_tweedie_score(y_true, y_pred, power=0), r2_score(y_true, y_pred)
    )
    # Tweedie deviance needs strictly positive y_pred (except for power=0),
    # and power >= 2 additionally needs strictly positive y_true.
    # The expected values below were evaluated symbolically with sympy; a
    # hedged numerical re-derivation follows this test.
    y_true = np.arange(1, 1 + n_samples)
    y_pred = 2 * y_true
    n = n_samples
    assert_almost_equal(
        mean_tweedie_deviance(y_true, y_pred, power=-1),
        5 / 12 * n * (n ** 2 + 2 * n + 1),
    )
    assert_almost_equal(
        mean_tweedie_deviance(y_true, y_pred, power=1), (n + 1) * (1 - np.log(2))
    )
    assert_almost_equal(
        mean_tweedie_deviance(y_true, y_pred, power=2), 2 * np.log(2) - 1
    )
    assert_almost_equal(
        mean_tweedie_deviance(y_true, y_pred, power=3 / 2),
        ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum(),
    )
    assert_almost_equal(
        mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n)
    )

    dev_mean = 2 * np.mean(xlogy(y_true, 2 * y_true / (n + 1)))
    assert_almost_equal(
        d2_tweedie_score(y_true, y_pred, power=1),
        1 - (n + 1) * (1 - np.log(2)) / dev_mean,
    )

    dev_mean = 2 * np.log((n + 1) / 2) - 2 / n * np.log(factorial(n))
    assert_almost_equal(
        d2_tweedie_score(y_true, y_pred, power=2), 1 - (2 * np.log(2) - 1) / dev_mean
    )

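
# A hedged cross-check of the sympy-derived closed forms above, not part of
# the upstream test suite. For power p outside {0, 1, 2}, the Tweedie unit
# deviance is
#     d(y, mu) = 2 * (y**(2-p) / ((1-p)*(2-p))
#                     - y * mu**(1-p) / (1-p) + mu**(2-p) / (2-p))
# so the mean deviance can be re-derived numerically. The helper name
# `_tweedie_unit_deviance` is ours.
def _tweedie_unit_deviance(y, mu, p):
    # Elementwise Tweedie unit deviance for p not in {0, 1, 2}.
    return 2 * (
        y ** (2 - p) / ((1 - p) * (2 - p))
        - y * mu ** (1 - p) / (1 - p)
        + mu ** (2 - p) / (2 - p)
    )


def test_tweedie_deviance_matches_unit_deviance_formula():
    y_true = np.arange(1, 11, dtype=float)
    y_pred = 2 * y_true
    for p in (-1.0, 1.5, 3.0):
        assert_allclose(
            mean_tweedie_deviance(y_true, y_pred, power=p),
            np.mean(_tweedie_unit_deviance(y_true, y_pred, p)),
        )
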

def test_mean_squared_error_multioutput_raw_value_squared():
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/pull/16323
    mse1 = mean_squared_error([[1]], [[10]], multioutput="raw_values", squared=True)
    mse2 = mean_squared_error([[1]], [[10]], multioutput="raw_values", squared=False)
    assert np.sqrt(mse1) == pytest.approx(mse2)

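
# Minimal follow-up sketch (ours, not upstream): with multioutput="raw_values"
# the squared=False option applies the square root per output, so the RMSE
# vector is the elementwise square root of the MSE vector.
def test_rmse_raw_values_elementwise_sqrt():
    y_true = [[1.0, 2.0], [3.0, 4.0]]
    y_pred = [[1.5, 2.0], [2.0, 5.0]]
    mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    rmse = mean_squared_error(y_true, y_pred, multioutput="raw_values", squared=False)
    assert_allclose(rmse, np.sqrt(mse))
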

def test_multioutput_regression():
    y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]])
    y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]])

    error = mean_squared_error(y_true, y_pred)
    assert_almost_equal(error, (1.0 / 3 + 2.0 / 3 + 2.0 / 3) / 4.0)

    error = mean_squared_error(y_true, y_pred, squared=False)
    assert_almost_equal(error, 0.454, decimal=2)

    error = mean_squared_log_error(y_true, y_pred)
    assert_almost_equal(error, 0.200, decimal=2)

    # mean_absolute_error and mean_squared_error are equal here because all
    # entries of y_true and y_pred are binary, so every residual is -1, 0 or 1
    # (see the sketch after this test).
    error = mean_absolute_error(y_true, y_pred)
    assert_almost_equal(error, (1.0 + 2.0 / 3) / 4.0)

    error = mean_pinball_loss(y_true, y_pred)
    assert_almost_equal(error, (1.0 + 2.0 / 3) / 8.0)

    error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2)
    assert np.isfinite(error)
    assert error > 1e6
    error = median_absolute_error(y_true, y_pred)
    assert_almost_equal(error, (1.0 + 1.0) / 4.0)

    error = r2_score(y_true, y_pred, multioutput="variance_weighted")
    assert_almost_equal(error, 1.0 - 5.0 / 2)
    error = r2_score(y_true, y_pred, multioutput="uniform_average")
    assert_almost_equal(error, -0.875)

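
# Hedged illustration (ours) of the comment above: with binary-valued targets
# and predictions every residual lies in {-1, 0, 1}, hence |e| == e**2
# elementwise and MAE coincides with MSE.
def test_mae_equals_mse_on_binary_targets():
    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=(20, 3))
    y_pred = rng.randint(0, 2, size=(20, 3))
    assert_allclose(
        mean_absolute_error(y_true, y_pred),
        mean_squared_error(y_true, y_pred),
    )
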

def test_regression_metrics_at_limits():
    assert_almost_equal(mean_squared_error([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_squared_error([0.0], [0.0], squared=False), 0.0)
    assert_almost_equal(mean_squared_log_error([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_absolute_error([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_pinball_loss([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_absolute_percentage_error([0.0], [0.0]), 0.0)
    assert_almost_equal(median_absolute_error([0.0], [0.0]), 0.0)
    assert_almost_equal(max_error([0.0], [0.0]), 0.0)
    assert_almost_equal(explained_variance_score([0.0], [0.0]), 1.0)
    assert_almost_equal(r2_score([0.0, 1], [0.0, 1]), 1.0)
    msg = (
        "Mean Squared Logarithmic Error cannot be used when targets "
        "contain negative values."
    )
    with pytest.raises(ValueError, match=msg):
        mean_squared_log_error([-1.0], [-1.0])
    with pytest.raises(ValueError, match=msg):
        mean_squared_log_error([1.0, 2.0, 3.0], [1.0, -2.0, 3.0])
    with pytest.raises(ValueError, match=msg):
        mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0])

    # Tweedie deviance error
    power = -1.2
    assert_allclose(
        mean_tweedie_deviance([0], [1.0], power=power), 2 / (2 - power), rtol=1e-3
    )
    msg = "can only be used on strictly positive y_pred."
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)
    with pytest.raises(ValueError, match=msg):
        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)

    assert_almost_equal(mean_tweedie_deviance([0.0], [0.0], power=0), 0.0, 2)

    power = 1.0
    msg = "only be used on non-negative y and strictly positive y_pred."
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)
    with pytest.raises(ValueError, match=msg):
        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)

    power = 1.5
    assert_allclose(mean_tweedie_deviance([0.0], [1.0], power=power), 2 / (2 - power))
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)
    with pytest.raises(ValueError, match=msg):
        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)

    power = 2.0
    assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8)
    msg = "can only be used on strictly positive y and y_pred."
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)
    with pytest.raises(ValueError, match=msg):
        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)

    power = 3.0
    assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8)
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)
    with pytest.raises(ValueError, match=msg):
        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)

    power = 0.5
    with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"):
        mean_tweedie_deviance([0.0], [0.0], power=power)
    with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"):
        d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power)

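
# Our hedged recap of the Tweedie domain rules exercised above: each tuple is
# a (power, y_true, y_pred) combination that should be accepted and yield a
# finite deviance.
@pytest.mark.parametrize(
    "power, y_true, y_pred",
    [
        (-1.0, [-2.0, 3.0], [1.0, 2.0]),  # power < 0: any real y, y_pred > 0
        (0.0, [-2.0, 3.0], [-1.0, 2.0]),  # power == 0: no restriction (MSE)
        (1.5, [0.0, 3.0], [1.0, 2.0]),  # 1 <= power < 2: y >= 0, y_pred > 0
        (3.0, [1.0, 3.0], [1.0, 2.0]),  # power >= 2: y > 0 and y_pred > 0
    ],
)
def test_tweedie_deviance_valid_domains(power, y_true, y_pred):
    assert np.isfinite(mean_tweedie_deviance(y_true, y_pred, power=power))
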

def test__check_reg_targets():
    # All of length 3
    EXAMPLES = [
        ("continuous", [1, 2, 3], 1),
        ("continuous", [[1], [2], [3]], 1),
        ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2),
        ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2),
        ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3),
    ]

    for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES, repeat=2):
        if type1 == type2 and n_out1 == n_out2:
            y_type, y_check1, y_check2, multioutput = _check_reg_targets(y1, y2, None)
            assert type1 == y_type
            if type1 == "continuous":
                assert_array_equal(y_check1, np.reshape(y1, (-1, 1)))
                assert_array_equal(y_check2, np.reshape(y2, (-1, 1)))
            else:
                assert_array_equal(y_check1, y1)
                assert_array_equal(y_check2, y2)
        else:
            with pytest.raises(ValueError):
                _check_reg_targets(y1, y2, None)

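
# Minimal usage sketch (ours) of the private helper: 1-D targets come back as
# column vectors and a valid multioutput option is passed through unchanged.
def test__check_reg_targets_reshapes_1d_to_column():
    y_type, y1, y2, multioutput = _check_reg_targets(
        [1, 2, 3], [2, 3, 4], "uniform_average"
    )
    assert y_type == "continuous"
    assert y1.shape == y2.shape == (3, 1)
    assert multioutput == "uniform_average"
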

def test__check_reg_targets_exception():
    invalid_multioutput = "this_value_is_not_valid"
    expected_message = (
        "Allowed 'multioutput' string values are.+You provided multioutput={!r}".format(
            invalid_multioutput
        )
    )
    with pytest.raises(ValueError, match=expected_message):
        _check_reg_targets([1, 2, 3], [[1], [2], [3]], invalid_multioutput)


def test_regression_multioutput_array():
    y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
    y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]

    mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    err_msg = (
        "multioutput is expected to be 'raw_values' "
        "or 'uniform_average' but we got 'variance_weighted' instead."
    )
    with pytest.raises(ValueError, match=err_msg):
        mean_pinball_loss(y_true, y_pred, multioutput="variance_weighted")
    pbl = mean_pinball_loss(y_true, y_pred, multioutput="raw_values")
    mape = mean_absolute_percentage_error(y_true, y_pred, multioutput="raw_values")
    r = r2_score(y_true, y_pred, multioutput="raw_values")
    evs = explained_variance_score(y_true, y_pred, multioutput="raw_values")

    assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2)
    assert_array_almost_equal(mae, [0.25, 0.625], decimal=2)
    assert_array_almost_equal(pbl, [0.25 / 2, 0.625 / 2], decimal=2)
    assert_array_almost_equal(mape, [0.0778, 0.2262], decimal=2)
    assert_array_almost_equal(r, [0.95, 0.93], decimal=2)
    assert_array_almost_equal(evs, [0.95, 0.93], decimal=2)

    # mean_absolute_error and mean_squared_error coincide here as well because
    # every entry of y_true and y_pred is binary.
    y_true = [[0, 0]] * 4
    y_pred = [[1, 1]] * 4
    mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    pbl = mean_pinball_loss(y_true, y_pred, multioutput="raw_values")
    r = r2_score(y_true, y_pred, multioutput="raw_values")
    assert_array_almost_equal(mse, [1.0, 1.0], decimal=2)
    assert_array_almost_equal(mae, [1.0, 1.0], decimal=2)
    assert_array_almost_equal(pbl, [0.5, 0.5], decimal=2)
    assert_array_almost_equal(r, [0.0, 0.0], decimal=2)

    r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values")
    assert_array_almost_equal(r, [0, -3.5], decimal=2)
    assert np.mean(r) == r2_score(
        [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="uniform_average"
    )
    evs = explained_variance_score(
        [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values"
    )
    assert_array_almost_equal(evs, [0, -1.25], decimal=2)

    # Checking for the condition in which both the numerator and the
    # denominator are zero.
    y_true = [[1, 3], [-1, 2]]
    y_pred = [[1, 4], [-1, 1]]
    r2 = r2_score(y_true, y_pred, multioutput="raw_values")
    assert_array_almost_equal(r2, [1.0, -3.0], decimal=2)
    assert np.mean(r2) == r2_score(y_true, y_pred, multioutput="uniform_average")
    evs = explained_variance_score(y_true, y_pred, multioutput="raw_values")
    assert_array_almost_equal(evs, [1.0, -3.0], decimal=2)
    assert np.mean(evs) == explained_variance_score(y_true, y_pred)

    # Handling msle separately as it does not accept negative inputs.
    y_true = np.array([[0.5, 1], [1, 2], [7, 6]])
    y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]])
    msle = mean_squared_log_error(y_true, y_pred, multioutput="raw_values")
    msle2 = mean_squared_error(
        np.log(1 + y_true), np.log(1 + y_pred), multioutput="raw_values"
    )
    assert_array_almost_equal(msle, msle2, decimal=2)

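
# Hedged illustration (ours) of why explained_variance_score and r2_score can
# disagree: a constant offset in the predictions leaves the variance of the
# residuals unchanged but inflates the squared error, so EVS stays at 1 while
# R^2 turns negative.
def test_explained_variance_ignores_constant_offset():
    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = y_true + 10.0  # perfectly correlated but biased predictions
    assert explained_variance_score(y_true, y_pred) == pytest.approx(1.0)
    assert r2_score(y_true, y_pred) < 0.0
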

def test_regression_custom_weights():
    y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
    y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]

    msew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6])
    rmsew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6], squared=False)
    maew = mean_absolute_error(y_true, y_pred, multioutput=[0.4, 0.6])
    mapew = mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.4, 0.6])
    rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6])
    evsw = explained_variance_score(y_true, y_pred, multioutput=[0.4, 0.6])

    assert_almost_equal(msew, 0.39, decimal=2)
    assert_almost_equal(rmsew, 0.59, decimal=2)
    assert_almost_equal(maew, 0.475, decimal=3)
    assert_almost_equal(mapew, 0.1668, decimal=2)
    assert_almost_equal(rw, 0.94, decimal=2)
    assert_almost_equal(evsw, 0.94, decimal=2)

    # Handling msle separately as it does not accept negative inputs.
    y_true = np.array([[0.5, 1], [1, 2], [7, 6]])
    y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]])
    msle = mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
    msle2 = mean_squared_error(
        np.log(1 + y_true), np.log(1 + y_pred), multioutput=[0.3, 0.7]
    )
    assert_almost_equal(msle, msle2, decimal=2)

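
# Hedged sketch (ours): "uniform_average" is the plain mean of the
# "raw_values" vector, and a weight vector turns it into a weighted average.
def test_multioutput_average_is_mean_of_raw_values():
    y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
    y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]
    raw = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    assert mean_absolute_error(y_true, y_pred) == pytest.approx(np.mean(raw))
    weights = [0.3, 0.7]
    assert mean_absolute_error(
        y_true, y_pred, multioutput=weights
    ) == pytest.approx(np.average(raw, weights=weights))
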

@pytest.mark.parametrize("metric", [r2_score, d2_tweedie_score])
def test_regression_single_sample(metric):
    y_true = [0]
    y_pred = [1]
    warning_msg = "not well-defined with less than two samples."

    # Trigger the warning
    with pytest.warns(UndefinedMetricWarning, match=warning_msg):
        score = metric(y_true, y_pred)
        assert np.isnan(score)


def test_deprecation_positional_arguments_mape():
    y_true = [1, 1, 1]
    y_pred = [1, 0, 1]
    sample_weights = [0.5, 0.1, 0.2]
    multioutput = "raw_values"

    warning_msg = "passing these as positional arguments will result in an error"

    # Trigger the warning
    with pytest.warns(FutureWarning, match=warning_msg):
        mean_absolute_percentage_error(y_true, y_pred, sample_weights, multioutput)

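
# Usage sketch (ours): passing the same arguments by keyword is the
# forward-compatible spelling and should raise no FutureWarning.
def test_mape_keyword_arguments_no_warning():
    import warnings

    with warnings.catch_warnings():
        # Escalate FutureWarning to an error so any warning fails the test.
        warnings.simplefilter("error", FutureWarning)
        mean_absolute_percentage_error(
            [1, 1, 1],
            [1, 0, 1],
            sample_weight=[0.5, 0.1, 0.2],
            multioutput="raw_values",
        )
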

def test_tweedie_deviance_continuity():
    n_samples = 100

    y_true = np.random.RandomState(0).rand(n_samples) + 0.1
    y_pred = np.random.RandomState(1).rand(n_samples) + 0.1

    assert_allclose(
        mean_tweedie_deviance(y_true, y_pred, power=0 - 1e-10),
        mean_tweedie_deviance(y_true, y_pred, power=0),
    )

    # As we get closer to the limit, e.g. with a 1e-12 difference instead of
    # 1e-10, the absolute tolerance needed for the checks below to pass
    # increases. There are likely numerical precision issues on the edges of
    # the different definition regions.
    assert_allclose(
        mean_tweedie_deviance(y_true, y_pred, power=1 + 1e-10),
        mean_tweedie_deviance(y_true, y_pred, power=1),
        atol=1e-6,
    )

    assert_allclose(
        mean_tweedie_deviance(y_true, y_pred, power=2 - 1e-10),
        mean_tweedie_deviance(y_true, y_pred, power=2),
        atol=1e-6,
    )

    assert_allclose(
        mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10),
        mean_tweedie_deviance(y_true, y_pred, power=2),
        atol=1e-6,
    )

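
# Hedged companion check (ours): at power=1 the Tweedie deviance should match
# the Poisson deviance closed form 2 * (y * log(y / y_pred) - y + y_pred).
def test_tweedie_deviance_poisson_closed_form():
    rng = np.random.RandomState(2)
    y_true = rng.rand(50) + 0.1
    y_pred = rng.rand(50) + 0.1
    poisson_dev = 2 * (xlogy(y_true, y_true / y_pred) - y_true + y_pred)
    assert_allclose(
        mean_tweedie_deviance(y_true, y_pred, power=1), np.mean(poisson_dev)
    )
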

def test_mean_absolute_percentage_error():
    random_number_generator = np.random.RandomState(42)
    y_true = random_number_generator.exponential(size=100)
    y_pred = 1.2 * y_true
    assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(0.2)

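
# Hedged follow-up sketch (ours): for any rescaling c * y_true of strictly
# positive targets, MAPE is exactly |c - 1|, independent of the data.
@pytest.mark.parametrize("c", [0.5, 0.8, 1.0, 2.0])
def test_mape_of_rescaled_predictions(c):
    y_true = np.random.RandomState(0).uniform(1, 10, size=50)
    assert mean_absolute_percentage_error(y_true, c * y_true) == pytest.approx(
        abs(c - 1)
    )
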

@pytest.mark.parametrize(
    "distribution", ["normal", "lognormal", "exponential", "uniform"]
)
@pytest.mark.parametrize("target_quantile", [0.05, 0.5, 0.75])
def test_mean_pinball_loss_on_constant_predictions(distribution, target_quantile):
    if not hasattr(np, "quantile"):
        pytest.skip(
            "This test requires a more recent version of numpy "
            "with support for np.quantile."
        )

    # Check that the pinball loss is minimized by the empirical quantile.
    n_samples = 3000
    rng = np.random.RandomState(42)
    data = getattr(rng, distribution)(size=n_samples)

    # Compute the best possible pinball loss for any constant predictor:
    best_pred = np.quantile(data, target_quantile)
    best_constant_pred = np.full(n_samples, fill_value=best_pred)
    best_pbl = mean_pinball_loss(data, best_constant_pred, alpha=target_quantile)

    # Evaluate the loss on a grid of quantiles
    candidate_predictions = np.quantile(data, np.linspace(0, 1, 100))
    for pred in candidate_predictions:
        # Compute the pinball loss of a constant predictor:
        constant_pred = np.full(n_samples, fill_value=pred)
        pbl = mean_pinball_loss(data, constant_pred, alpha=target_quantile)
        # Check that the loss of this constant predictor is greater than or
        # equal to the loss of using the optimal quantile (up to machine
        # precision):
        assert pbl >= best_pbl - np.finfo(best_pbl.dtype).eps

        # Check that the value of the pinball loss matches the analytical
        # formula.
        expected_pbl = (pred - data[data < pred]).sum() * (1 - target_quantile) + (
            data[data >= pred] - pred
        ).sum() * target_quantile
        expected_pbl /= n_samples
        assert_almost_equal(expected_pbl, pbl)

    # Check that we can actually recover the target_quantile by minimizing the
    # pinball loss w.r.t. the constant prediction quantile.
    def objective_func(x):
        constant_pred = np.full(n_samples, fill_value=x)
        return mean_pinball_loss(data, constant_pred, alpha=target_quantile)

    result = optimize.minimize(objective_func, data.mean(), method="Nelder-Mead")
    assert result.success
    # The minimum is not unique with limited data, hence the large tolerance.
    assert result.x == pytest.approx(best_pred, rel=1e-2)
    assert result.fun == pytest.approx(best_pbl)

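
# Hedged reference implementation (ours) of the pinball loss definition used
# in the analytical check above: with diff = y_true - y_pred, the loss is
# max(alpha * diff, (alpha - 1) * diff) averaged over samples.
def test_mean_pinball_loss_reference_implementation():
    rng = np.random.RandomState(3)
    y_true = rng.rand(20)
    y_pred = rng.rand(20)
    for alpha in (0.1, 0.5, 0.9):
        diff = y_true - y_pred
        expected = np.mean(np.maximum(alpha * diff, (alpha - 1) * diff))
        assert mean_pinball_loss(y_true, y_pred, alpha=alpha) == pytest.approx(
            expected
        )
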

def test_dummy_quantile_parameter_tuning():
    # Integration test to check that it is possible to use the pinball loss to
    # tune the hyperparameter of a quantile regressor. This is conceptually
    # similar to the previous test but using the scikit-learn estimator and
    # scoring API instead.
    n_samples = 1000
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, 5))  # Ignored
    y = rng.exponential(size=n_samples)

    all_quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
    for alpha in all_quantiles:
        neg_mean_pinball_loss = make_scorer(
            mean_pinball_loss,
            alpha=alpha,
            greater_is_better=False,
        )
        regressor = DummyRegressor(strategy="quantile", quantile=0.25)
        grid_search = GridSearchCV(
            regressor,
            param_grid=dict(quantile=all_quantiles),
            scoring=neg_mean_pinball_loss,
        ).fit(X, y)

        assert grid_search.best_params_["quantile"] == pytest.approx(alpha)

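
# Hedged note on the sign convention used above (ours): with
# greater_is_better=False, make_scorer negates the loss so that larger scorer
# values always mean better models inside GridSearchCV.
def test_neg_mean_pinball_loss_sign_convention():
    scorer = make_scorer(mean_pinball_loss, alpha=0.5, greater_is_better=False)
    X = np.zeros((2, 1))
    y = np.array([1.0, 2.0])
    reg = DummyRegressor(strategy="constant", constant=0.0).fit(X, y)
    assert scorer(reg, X, y) == pytest.approx(
        -mean_pinball_loss(y, reg.predict(X), alpha=0.5)
    )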