1"""Generic interface for least-squares minimization."""
2from warnings import warn
3
4import numpy as np
5from numpy.linalg import norm
6
7from scipy.sparse import issparse, csr_matrix
8from scipy.sparse.linalg import LinearOperator
9from scipy.optimize import _minpack, OptimizeResult
10from scipy.optimize._numdiff import approx_derivative, group_columns
11
12from .trf import trf
13from .dogbox import dogbox
14from .common import EPS, in_bounds, make_strictly_feasible
15
16
17TERMINATION_MESSAGES = {
18    -1: "Improper input parameters status returned from `leastsq`",
19    0: "The maximum number of function evaluations is exceeded.",
20    1: "`gtol` termination condition is satisfied.",
21    2: "`ftol` termination condition is satisfied.",
22    3: "`xtol` termination condition is satisfied.",
23    4: "Both `ftol` and `xtol` termination conditions are satisfied."
24}
25
26
27FROM_MINPACK_TO_COMMON = {
28    0: -1,  # Improper input parameters from MINPACK.
29    1: 2,
30    2: 3,
31    3: 4,
32    4: 1,
33    5: 0
34    # There are 6, 7, 8 for too small tolerance parameters,
35    # but we guard against it by checking ftol, xtol, gtol beforehand.
36}
37
38
39def call_minpack(fun, x0, jac, ftol, xtol, gtol, max_nfev, x_scale, diff_step):
40    n = x0.size
41
42    if diff_step is None:
43        epsfcn = EPS
44    else:
45        epsfcn = diff_step**2
46
47    # Compute MINPACK's `diag`, which is inverse of our `x_scale` and
48    # ``x_scale='jac'`` corresponds to ``diag=None``.
49    if isinstance(x_scale, str) and x_scale == 'jac':
50        diag = None
51    else:
52        diag = 1 / x_scale
53
54    full_output = True
55    col_deriv = False
56    factor = 100.0
57
58    if jac is None:
59        if max_nfev is None:
60            # n squared to account for Jacobian evaluations.
61            max_nfev = 100 * n * (n + 1)
62        x, info, status = _minpack._lmdif(
63            fun, x0, (), full_output, ftol, xtol, gtol,
64            max_nfev, epsfcn, factor, diag)
65    else:
66        if max_nfev is None:
67            max_nfev = 100 * n
68        x, info, status = _minpack._lmder(
69            fun, jac, x0, (), full_output, col_deriv,
70            ftol, xtol, gtol, max_nfev, factor, diag)
71
72    f = info['fvec']
73
74    if callable(jac):
75        J = jac(x)
76    else:
77        J = np.atleast_2d(approx_derivative(fun, x))
78
79    cost = 0.5 * np.dot(f, f)
80    g = J.T.dot(f)
81    g_norm = norm(g, ord=np.inf)
82
83    nfev = info['nfev']
84    njev = info.get('njev', None)
85
86    status = FROM_MINPACK_TO_COMMON[status]
87    active_mask = np.zeros_like(x0, dtype=int)
88
89    return OptimizeResult(
90        x=x, cost=cost, fun=f, jac=J, grad=g, optimality=g_norm,
91        active_mask=active_mask, nfev=nfev, njev=njev, status=status)


def prepare_bounds(bounds, n):
    lb, ub = [np.asarray(b, dtype=float) for b in bounds]
    if lb.ndim == 0:
        lb = np.resize(lb, n)

    if ub.ndim == 0:
        ub = np.resize(ub, n)

    return lb, ub


def check_tolerance(ftol, xtol, gtol, method):
    def check(tol, name):
        if tol is None:
            tol = 0
        elif tol < EPS:
            warn("Setting `{}` below the machine epsilon ({:.2e}) effectively "
                 "disables the corresponding termination condition."
                 .format(name, EPS))
        return tol

    ftol = check(ftol, "ftol")
    xtol = check(xtol, "xtol")
    gtol = check(gtol, "gtol")

    if method == "lm" and (ftol < EPS or xtol < EPS or gtol < EPS):
        raise ValueError("All tolerances must be higher than machine epsilon "
                         "({:.2e}) for method 'lm'.".format(EPS))
    elif ftol < EPS and xtol < EPS and gtol < EPS:
        raise ValueError("At least one of the tolerances must be higher than "
                         "machine epsilon ({:.2e}).".format(EPS))

    return ftol, xtol, gtol


def check_x_scale(x_scale, x0):
    if isinstance(x_scale, str) and x_scale == 'jac':
        return x_scale

    try:
        x_scale = np.asarray(x_scale, dtype=float)
        valid = np.all(np.isfinite(x_scale)) and np.all(x_scale > 0)
    except (ValueError, TypeError):
        valid = False

    if not valid:
        raise ValueError("`x_scale` must be 'jac' or array_like with "
                         "positive numbers.")

    if x_scale.ndim == 0:
        x_scale = np.resize(x_scale, x0.shape)

    if x_scale.shape != x0.shape:
        raise ValueError("Inconsistent shapes between `x_scale` and `x0`.")

    return x_scale


def check_jac_sparsity(jac_sparsity, m, n):
    if jac_sparsity is None:
        return None

    if not issparse(jac_sparsity):
        jac_sparsity = np.atleast_2d(jac_sparsity)

    if jac_sparsity.shape != (m, n):
        raise ValueError("`jac_sparsity` has wrong shape.")

    return jac_sparsity, group_columns(jac_sparsity)


# Loss functions.
# Each of the helpers below fills `rho` in place: row 0 with rho(z), row 1
# with the first derivative and row 2 with the second derivative of rho
# with respect to z.


def huber(z, rho, cost_only):
    mask = z <= 1
    rho[0, mask] = z[mask]
    rho[0, ~mask] = 2 * z[~mask]**0.5 - 1
    if cost_only:
        return
    rho[1, mask] = 1
    rho[1, ~mask] = z[~mask]**-0.5
    rho[2, mask] = 0
    rho[2, ~mask] = -0.5 * z[~mask]**-1.5


def soft_l1(z, rho, cost_only):
    t = 1 + z
    rho[0] = 2 * (t**0.5 - 1)
    if cost_only:
        return
    rho[1] = t**-0.5
    rho[2] = -0.5 * t**-1.5


def cauchy(z, rho, cost_only):
    rho[0] = np.log1p(z)
    if cost_only:
        return
    t = 1 + z
    rho[1] = 1 / t
    rho[2] = -1 / t**2


def arctan(z, rho, cost_only):
    rho[0] = np.arctan(z)
    if cost_only:
        return
    t = 1 + z**2
    rho[1] = 1 / t
    rho[2] = -2 * z / t**2


IMPLEMENTED_LOSSES = dict(linear=None, huber=huber, soft_l1=soft_l1,
                          cauchy=cauchy, arctan=arctan)


def construct_loss_function(m, loss, f_scale):
    if loss == 'linear':
        return None

    if not callable(loss):
        loss = IMPLEMENTED_LOSSES[loss]
        rho = np.empty((3, m))
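        # This buffer is reused on every call of the closure below, so the
        # array it returns is overwritten by the next evaluation.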

        def loss_function(f, cost_only=False):
            z = (f / f_scale) ** 2
            loss(z, rho, cost_only=cost_only)
            if cost_only:
                return 0.5 * f_scale ** 2 * np.sum(rho[0])
            # Chain rule for rho_(f**2) = f_scale**2 * rho(z): the value
            # scales by f_scale**2, the first derivative is unchanged and
            # the second derivative scales by 1 / f_scale**2.
            rho[0] *= f_scale ** 2
            rho[2] /= f_scale ** 2
            return rho
    else:
        def loss_function(f, cost_only=False):
            z = (f / f_scale) ** 2
            rho = loss(z)
            if cost_only:
                return 0.5 * f_scale ** 2 * np.sum(rho[0])
            rho[0] *= f_scale ** 2
            rho[2] /= f_scale ** 2
            return rho

    return loss_function


def least_squares(
        fun, x0, jac='2-point', bounds=(-np.inf, np.inf), method='trf',
        ftol=1e-8, xtol=1e-8, gtol=1e-8, x_scale=1.0, loss='linear',
        f_scale=1.0, diff_step=None, tr_solver=None, tr_options={},
        jac_sparsity=None, max_nfev=None, verbose=0, args=(), kwargs={}):
245    """Solve a nonlinear least-squares problem with bounds on the variables.
246
247    Given the residuals f(x) (an m-D real function of n real
248    variables) and the loss function rho(s) (a scalar function), `least_squares`
249    finds a local minimum of the cost function F(x)::
250
251        minimize F(x) = 0.5 * sum(rho(f_i(x)**2), i = 0, ..., m - 1)
252        subject to lb <= x <= ub
253
254    The purpose of the loss function rho(s) is to reduce the influence of
255    outliers on the solution.
256
257    Parameters
258    ----------
259    fun : callable
260        Function which computes the vector of residuals, with the signature
261        ``fun(x, *args, **kwargs)``, i.e., the minimization proceeds with
262        respect to its first argument. The argument ``x`` passed to this
263        function is an ndarray of shape (n,) (never a scalar, even for n=1).
264        It must allocate and return a 1-D array_like of shape (m,) or a scalar.
265        If the argument ``x`` is complex or the function ``fun`` returns
266        complex residuals, it must be wrapped in a real function of real
267        arguments, as shown at the end of the Examples section.
268    x0 : array_like with shape (n,) or float
269        Initial guess on independent variables. If float, it will be treated
270        as a 1-D array with one element.
271    jac : {'2-point', '3-point', 'cs', callable}, optional
272        Method of computing the Jacobian matrix (an m-by-n matrix, where
273        element (i, j) is the partial derivative of f[i] with respect to
274        x[j]). The keywords select a finite difference scheme for numerical
275        estimation. The scheme '3-point' is more accurate, but requires
276        twice as many operations as '2-point' (default). The scheme 'cs'
277        uses complex steps, and while potentially the most accurate, it is
278        applicable only when `fun` correctly handles complex inputs and
279        can be analytically continued to the complex plane. Method 'lm'
280        always uses the '2-point' scheme. If callable, it is used as
281        ``jac(x, *args, **kwargs)`` and should return a good approximation
282        (or the exact value) for the Jacobian as an array_like (np.atleast_2d
283        is applied), a sparse matrix (csr_matrix preferred for performance) or
284        a `scipy.sparse.linalg.LinearOperator`.
285    bounds : 2-tuple of array_like, optional
286        Lower and upper bounds on independent variables. Defaults to no bounds.
287        Each array must match the size of `x0` or be a scalar, in the latter
288        case a bound will be the same for all variables. Use ``np.inf`` with
289        an appropriate sign to disable bounds on all or some variables.
290    method : {'trf', 'dogbox', 'lm'}, optional
291        Algorithm to perform minimization.
292
293            * 'trf' : Trust Region Reflective algorithm, particularly suitable
294              for large sparse problems with bounds. Generally robust method.
295            * 'dogbox' : dogleg algorithm with rectangular trust regions,
296              typical use case is small problems with bounds. Not recommended
297              for problems with rank-deficient Jacobian.
298            * 'lm' : Levenberg-Marquardt algorithm as implemented in MINPACK.
299              Doesn't handle bounds and sparse Jacobians. Usually the most
300              efficient method for small unconstrained problems.
301
302        Default is 'trf'. See Notes for more information.
303    ftol : float or None, optional
304        Tolerance for termination by the change of the cost function. Default
305        is 1e-8. The optimization process is stopped when ``dF < ftol * F``,
306        and there was an adequate agreement between a local quadratic model and
307        the true model in the last step.
308
309        If None and 'method' is not 'lm', the termination by this condition is
310        disabled. If 'method' is 'lm', this tolerance must be higher than
311        machine epsilon.
312    xtol : float or None, optional
313        Tolerance for termination by the change of the independent variables.
314        Default is 1e-8. The exact condition depends on the `method` used:
315
316            * For 'trf' and 'dogbox' : ``norm(dx) < xtol * (xtol + norm(x))``.
317            * For 'lm' : ``Delta < xtol * norm(xs)``, where ``Delta`` is
318              a trust-region radius and ``xs`` is the value of ``x``
319              scaled according to `x_scale` parameter (see below).
320
321        If None and 'method' is not 'lm', the termination by this condition is
322        disabled. If 'method' is 'lm', this tolerance must be higher than
323        machine epsilon.
324    gtol : float or None, optional
325        Tolerance for termination by the norm of the gradient. Default is 1e-8.
326        The exact condition depends on a `method` used:
327
328            * For 'trf' : ``norm(g_scaled, ord=np.inf) < gtol``, where
329              ``g_scaled`` is the value of the gradient scaled to account for
330              the presence of the bounds [STIR]_.
331            * For 'dogbox' : ``norm(g_free, ord=np.inf) < gtol``, where
332              ``g_free`` is the gradient with respect to the variables which
333              are not in the optimal state on the boundary.
334            * For 'lm' : the maximum absolute value of the cosine of angles
335              between columns of the Jacobian and the residual vector is less
336              than `gtol`, or the residual vector is zero.
337
338        If None and 'method' is not 'lm', the termination by this condition is
339        disabled. If 'method' is 'lm', this tolerance must be higher than
340        machine epsilon.
341    x_scale : array_like or 'jac', optional
342        Characteristic scale of each variable. Setting `x_scale` is equivalent
343        to reformulating the problem in scaled variables ``xs = x / x_scale``.
344        An alternative view is that the size of a trust region along jth
345        dimension is proportional to ``x_scale[j]``. Improved convergence may
346        be achieved by setting `x_scale` such that a step of a given size
347        along any of the scaled variables has a similar effect on the cost
348        function. If set to 'jac', the scale is iteratively updated using the
349        inverse norms of the columns of the Jacobian matrix (as described in
350        [JJMore]_).
351    loss : str or callable, optional
352        Determines the loss function. The following keyword values are allowed:
353
354            * 'linear' (default) : ``rho(z) = z``. Gives a standard
355              least-squares problem.
356            * 'soft_l1' : ``rho(z) = 2 * ((1 + z)**0.5 - 1)``. The smooth
357              approximation of l1 (absolute value) loss. Usually a good
358              choice for robust least squares.
359            * 'huber' : ``rho(z) = z if z <= 1 else 2*z**0.5 - 1``. Works
360              similarly to 'soft_l1'.
361            * 'cauchy' : ``rho(z) = ln(1 + z)``. Severely weakens outliers
362              influence, but may cause difficulties in optimization process.
363            * 'arctan' : ``rho(z) = arctan(z)``. Limits a maximum loss on
364              a single residual, has properties similar to 'cauchy'.
365
366        If callable, it must take a 1-D ndarray ``z=f**2`` and return an
367        array_like with shape (3, m) where row 0 contains function values,
368        row 1 contains first derivatives and row 2 contains second
369        derivatives. Method 'lm' supports only 'linear' loss.
    f_scale : float, optional
        Value of soft margin between inlier and outlier residuals, default
        is 1.0. The loss function is evaluated as follows:
        ``rho_(f**2) = C**2 * rho(f**2 / C**2)``, where ``C`` is `f_scale`,
        and ``rho`` is determined by the `loss` parameter. This parameter has
        no effect with ``loss='linear'``, but for other `loss` values it is
        of crucial importance.
    max_nfev : None or int, optional
        Maximum number of function evaluations before the termination.
        If None (default), the value is chosen automatically:

            * For 'trf' and 'dogbox' : 100 * n.
            * For 'lm' : 100 * n if `jac` is callable and 100 * n * (n + 1)
              otherwise (because 'lm' counts function calls in Jacobian
              estimation).

    diff_step : None or array_like, optional
        Determines the relative step size for the finite difference
        approximation of the Jacobian. The actual step is computed as
        ``x * diff_step``. If None (default), then `diff_step` is taken to be
        a conventional "optimal" power of machine epsilon for the finite
        difference scheme used [NR]_.
    tr_solver : {None, 'exact', 'lsmr'}, optional
        Method for solving trust-region subproblems, relevant only for 'trf'
        and 'dogbox' methods.

            * 'exact' is suitable for not very large problems with dense
              Jacobian matrices. The computational complexity per iteration is
              comparable to a singular value decomposition of the Jacobian
              matrix.
            * 'lsmr' is suitable for problems with sparse and large Jacobian
              matrices. It uses the iterative procedure
              `scipy.sparse.linalg.lsmr` for finding a solution of a linear
              least-squares problem and only requires matrix-vector product
              evaluations.

        If None (default), the solver is chosen based on the type of Jacobian
        returned on the first iteration.
    tr_options : dict, optional
        Keyword options passed to trust-region solver.

            * ``tr_solver='exact'``: `tr_options` are ignored.
            * ``tr_solver='lsmr'``: options for `scipy.sparse.linalg.lsmr`.
              Additionally, ``method='trf'`` supports the 'regularize' option
              (bool, default is True), which adds a regularization term to the
              normal equation, which improves convergence if the Jacobian is
              rank-deficient [Byrd]_ (eq. 3.4).

    jac_sparsity : {None, array_like, sparse matrix}, optional
        Defines the sparsity structure of the Jacobian matrix for finite
        difference estimation, its shape must be (m, n). If the Jacobian has
        only a few non-zero elements in *each* row, providing the sparsity
        structure will greatly speed up the computations [Curtis]_. A zero
        entry means that a corresponding element in the Jacobian is identically
        zero. If provided, forces the use of the 'lsmr' trust-region solver.
        If None (default), then dense differencing will be used. Has no effect
        for 'lm' method.
    verbose : {0, 1, 2}, optional
        Level of algorithm's verbosity:

            * 0 (default) : work silently.
            * 1 : display a termination report.
            * 2 : display progress during iterations (not supported by 'lm'
              method).

    args, kwargs : tuple and dict, optional
        Additional arguments passed to `fun` and `jac`. Both empty by default.
        The calling signature is ``fun(x, *args, **kwargs)`` and the same for
        `jac`.

    Returns
    -------
    result : OptimizeResult
        `OptimizeResult` with the following fields defined:

            x : ndarray, shape (n,)
                Solution found.
            cost : float
                Value of the cost function at the solution.
            fun : ndarray, shape (m,)
                Vector of residuals at the solution.
            jac : ndarray, sparse matrix or LinearOperator, shape (m, n)
                Modified Jacobian matrix at the solution, in the sense that J^T J
                is a Gauss-Newton approximation of the Hessian of the cost function.
                The type is the same as the one used by the algorithm.
            grad : ndarray, shape (n,)
                Gradient of the cost function at the solution.
            optimality : float
                First-order optimality measure. In unconstrained problems, it is
                always the uniform norm of the gradient. In constrained problems,
                it is the quantity which was compared with `gtol` during iterations.
            active_mask : ndarray of int, shape (n,)
                Each component shows whether a corresponding constraint is active
                (that is, whether a variable is at the bound):

                    *  0 : a constraint is not active.
                    * -1 : a lower bound is active.
                    *  1 : an upper bound is active.

                Might be somewhat arbitrary for 'trf' method as it generates a
                sequence of strictly feasible iterates and `active_mask` is
                determined within a tolerance threshold.
            nfev : int
                Number of function evaluations done. Methods 'trf' and 'dogbox' do
                not count function calls for numerical Jacobian approximation, as
                opposed to 'lm' method.
            njev : int or None
                Number of Jacobian evaluations done. If numerical Jacobian
                approximation is used in 'lm' method, it is set to None.
            status : int
                The reason for algorithm termination:

                    * -1 : improper input parameters status returned from MINPACK.
                    *  0 : the maximum number of function evaluations is exceeded.
                    *  1 : `gtol` termination condition is satisfied.
                    *  2 : `ftol` termination condition is satisfied.
                    *  3 : `xtol` termination condition is satisfied.
                    *  4 : Both `ftol` and `xtol` termination conditions are satisfied.

            message : str
                Verbal description of the termination reason.
            success : bool
                True if one of the convergence criteria is satisfied (`status` > 0).

    See Also
    --------
    leastsq : A legacy wrapper for the MINPACK implementation of the
              Levenberg-Marquardt algorithm.
    curve_fit : Least-squares minimization applied to a curve-fitting problem.

    Notes
    -----
    Method 'lm' (Levenberg-Marquardt) calls a wrapper over least-squares
    algorithms implemented in MINPACK (lmder, lmdif). It runs the
    Levenberg-Marquardt algorithm formulated as a trust-region type algorithm.
    The implementation is based on the paper [JJMore]_; it is very robust and
    efficient, with a lot of smart tricks. It should be your first choice
    for unconstrained problems. Note that it doesn't support bounds. Also,
    it doesn't work when m < n.

    Method 'trf' (Trust Region Reflective) is motivated by the process of
    solving a system of equations, which constitute the first-order optimality
    condition for a bound-constrained minimization problem as formulated in
    [STIR]_. The algorithm iteratively solves trust-region subproblems
    augmented by a special diagonal quadratic term and with trust-region shape
    determined by the distance from the bounds and the direction of the
    gradient. These enhancements help to avoid making steps directly into bounds
    and efficiently explore the whole space of variables. To further improve
    convergence, the algorithm considers search directions reflected from the
    bounds. To obey theoretical requirements, the algorithm keeps iterates
    strictly feasible. With dense Jacobians trust-region subproblems are
    solved by an exact method very similar to the one described in [JJMore]_
    (and implemented in MINPACK). The difference from the MINPACK
    implementation is that a singular value decomposition of a Jacobian
    matrix is done once per iteration, instead of a QR decomposition and series
    of Givens rotation eliminations. For large sparse Jacobians a 2-D subspace
    approach of solving trust-region subproblems is used [STIR]_, [Byrd]_.
    The subspace is spanned by a scaled gradient and an approximate
    Gauss-Newton solution delivered by `scipy.sparse.linalg.lsmr`. When no
    constraints are imposed the algorithm is very similar to MINPACK and has
    generally comparable performance. The algorithm works quite robustly in
    unbounded and bounded problems, thus it is chosen as the default algorithm.

    Method 'dogbox' operates in a trust-region framework, but considers
    rectangular trust regions as opposed to conventional ellipsoids [Voglis]_.
    The intersection of a current trust region and initial bounds is again
    rectangular, so on each iteration a quadratic minimization problem subject
    to bound constraints is solved approximately by Powell's dogleg method
    [NumOpt]_. The required Gauss-Newton step can be computed exactly for
    dense Jacobians or approximately by `scipy.sparse.linalg.lsmr` for large
    sparse Jacobians. The algorithm is likely to exhibit slow convergence when
    the rank of the Jacobian is less than the number of variables. The algorithm
    often outperforms 'trf' in bounded problems with a small number of
    variables.

    Robust loss functions are implemented as described in [BA]_. The idea
    is to modify a residual vector and a Jacobian matrix on each iteration
    such that computed gradient and Gauss-Newton Hessian approximation match
    the true gradient and Hessian approximation of the cost function. Then
    the algorithm proceeds in a normal way, i.e., robust loss functions are
    implemented as a simple wrapper over standard least-squares algorithms.
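
    As an illustrative sketch of this idea (not a full description of the
    implementation), write ``z_i = f_i(x)**2``. Differentiating
    ``F(x) = 0.5 * sum(rho(z_i))`` gives the gradient ``J.T.dot(rho'(z) * f)``,
    so scaling each residual ``f_i`` and the corresponding row of the Jacobian
    by ``rho'(z_i)**0.5`` lets the ordinary least-squares formulas reproduce
    the robust gradient; the second derivatives ``rho''(z_i)`` returned by the
    loss function are available to refine the Hessian approximation.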

    .. versionadded:: 0.17.0

    References
    ----------
    .. [STIR] M. A. Branch, T. F. Coleman, and Y. Li, "A Subspace, Interior,
              and Conjugate Gradient Method for Large-Scale Bound-Constrained
              Minimization Problems," SIAM Journal on Scientific Computing,
              Vol. 21, Number 1, pp 1-23, 1999.
    .. [NR] William H. Press et al., "Numerical Recipes. The Art of Scientific
            Computing. 3rd edition", Sec. 5.7.
    .. [Byrd] R. H. Byrd, R. B. Schnabel and G. A. Shultz, "Approximate
              solution of the trust region problem by minimization over
              two-dimensional subspaces", Math. Programming, 40, pp. 247-263,
              1988.
    .. [Curtis] A. Curtis, M. J. D. Powell, and J. Reid, "On the estimation of
                sparse Jacobian matrices", Journal of the Institute of
                Mathematics and its Applications, 13, pp. 117-120, 1974.
    .. [JJMore] J. J. More, "The Levenberg-Marquardt Algorithm: Implementation
                and Theory," Numerical Analysis, ed. G. A. Watson, Lecture
                Notes in Mathematics 630, Springer Verlag, pp. 105-116, 1977.
    .. [Voglis] C. Voglis and I. E. Lagaris, "A Rectangular Trust Region
                Dogleg Approach for Unconstrained and Bound Constrained
                Nonlinear Optimization", WSEAS International Conference on
                Applied Mathematics, Corfu, Greece, 2004.
    .. [NumOpt] J. Nocedal and S. J. Wright, "Numerical optimization,
                2nd edition", Chapter 4.
    .. [BA] B. Triggs et al., "Bundle Adjustment - A Modern Synthesis",
            Proceedings of the International Workshop on Vision Algorithms:
            Theory and Practice, pp. 298-372, 1999.

    Examples
    --------
    In this example we find a minimum of the Rosenbrock function without bounds
    on independent variables.

    >>> def fun_rosenbrock(x):
    ...     return np.array([10 * (x[1] - x[0]**2), (1 - x[0])])

    Notice that we only provide the vector of the residuals. The algorithm
    constructs the cost function as a sum of squares of the residuals, which
    gives the Rosenbrock function. The exact minimum is at ``x = [1.0, 1.0]``.

    >>> from scipy.optimize import least_squares
    >>> x0_rosenbrock = np.array([2, 2])
    >>> res_1 = least_squares(fun_rosenbrock, x0_rosenbrock)
    >>> res_1.x
    array([ 1.,  1.])
    >>> res_1.cost
    9.8669242910846867e-30
    >>> res_1.optimality
    8.8928864934219529e-14

    We now constrain the variables, in such a way that the previous solution
    becomes infeasible. Specifically, we require that ``x[1] >= 1.5``, while
    ``x[0]`` is left unconstrained. To this end, we specify the `bounds` parameter
    to `least_squares` in the form ``bounds=([-np.inf, 1.5], np.inf)``.

    We also provide the analytic Jacobian:

    >>> def jac_rosenbrock(x):
    ...     return np.array([
    ...         [-20 * x[0], 10],
    ...         [-1, 0]])

    Putting this all together, we see that the new solution lies on the bound:

    >>> res_2 = least_squares(fun_rosenbrock, x0_rosenbrock, jac_rosenbrock,
    ...                       bounds=([-np.inf, 1.5], np.inf))
    >>> res_2.x
    array([ 1.22437075,  1.5       ])
    >>> res_2.cost
    0.025213093946805685
    >>> res_2.optimality
    1.5885401433157753e-07

    Now we solve a system of equations (i.e., the cost function should be zero
    at a minimum) for a Broyden tridiagonal vector-valued function of 100000
    variables:

    >>> def fun_broyden(x):
    ...     f = (3 - x) * x + 1
    ...     f[1:] -= x[:-1]
    ...     f[:-1] -= 2 * x[1:]
    ...     return f

    The corresponding Jacobian matrix is sparse. We tell the algorithm to
    estimate it by finite differences and provide the sparsity structure of
    the Jacobian to significantly speed up this process.

    >>> from scipy.sparse import lil_matrix
    >>> def sparsity_broyden(n):
    ...     sparsity = lil_matrix((n, n), dtype=int)
    ...     i = np.arange(n)
    ...     sparsity[i, i] = 1
    ...     i = np.arange(1, n)
    ...     sparsity[i, i - 1] = 1
    ...     i = np.arange(n - 1)
    ...     sparsity[i, i + 1] = 1
    ...     return sparsity
    ...
    >>> n = 100000
    >>> x0_broyden = -np.ones(n)
    ...
    >>> res_3 = least_squares(fun_broyden, x0_broyden,
    ...                       jac_sparsity=sparsity_broyden(n))
    >>> res_3.cost
    4.5687069299604613e-23
    >>> res_3.optimality
    1.1650454296851518e-11
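
    The trust-region solver is picked automatically here ('lsmr', because the
    estimated Jacobian is sparse). As a purely illustrative sketch, it can also
    be requested explicitly, with keyword options forwarded to
    `scipy.sparse.linalg.lsmr` through `tr_options` (the defaults are usually
    adequate):

    >>> res_3_lsmr = least_squares(fun_broyden, x0_broyden,
    ...                            jac_sparsity=sparsity_broyden(n),
    ...                            tr_solver='lsmr', tr_options={'atol': 1e-10})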

    Let's also solve a curve fitting problem using a robust loss function to
    take care of outliers in the data. Define the model function as
    ``y = a + b * exp(c * t)``, where t is a predictor variable, y is an
    observation and a, b, c are parameters to estimate.

    First, define the function which generates the data with noise and
    outliers, define the model parameters, and generate data:

    >>> from numpy.random import default_rng
    >>> rng = default_rng()
    >>> def gen_data(t, a, b, c, noise=0., n_outliers=0, seed=None):
    ...     rng = default_rng(seed)
    ...
    ...     y = a + b * np.exp(t * c)
    ...
    ...     error = noise * rng.standard_normal(t.size)
    ...     outliers = rng.integers(0, t.size, n_outliers)
    ...     error[outliers] *= 10
    ...
    ...     return y + error
    ...
    >>> a = 0.5
    >>> b = 2.0
    >>> c = -1
    >>> t_min = 0
    >>> t_max = 10
    >>> n_points = 15
    ...
    >>> t_train = np.linspace(t_min, t_max, n_points)
    >>> y_train = gen_data(t_train, a, b, c, noise=0.1, n_outliers=3)

    Define the function for computing residuals and the initial estimate of
    the parameters.

    >>> def fun(x, t, y):
    ...     return x[0] + x[1] * np.exp(x[2] * t) - y
    ...
    >>> x0 = np.array([1.0, 1.0, 0.0])

    Compute a standard least-squares solution:

    >>> res_lsq = least_squares(fun, x0, args=(t_train, y_train))

    Now compute two solutions with two different robust loss functions. The
    parameter `f_scale` is set to 0.1, meaning that inlier residuals should
    not significantly exceed 0.1 (the noise level used).

    >>> res_soft_l1 = least_squares(fun, x0, loss='soft_l1', f_scale=0.1,
    ...                             args=(t_train, y_train))
    >>> res_log = least_squares(fun, x0, loss='cauchy', f_scale=0.1,
    ...                         args=(t_train, y_train))

    And, finally, plot all the curves. We see that by selecting an appropriate
    `loss` we can get estimates close to optimal even in the presence of
    strong outliers. But keep in mind that generally it is recommended to try
    'soft_l1' or 'huber' losses first (if at all necessary) as the other two
    options may cause difficulties in the optimization process.

    >>> t_test = np.linspace(t_min, t_max, n_points * 10)
    >>> y_true = gen_data(t_test, a, b, c)
    >>> y_lsq = gen_data(t_test, *res_lsq.x)
    >>> y_soft_l1 = gen_data(t_test, *res_soft_l1.x)
    >>> y_log = gen_data(t_test, *res_log.x)
    ...
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(t_train, y_train, 'o')
    >>> plt.plot(t_test, y_true, 'k', linewidth=2, label='true')
    >>> plt.plot(t_test, y_lsq, label='linear loss')
    >>> plt.plot(t_test, y_soft_l1, label='soft_l1 loss')
    >>> plt.plot(t_test, y_log, label='cauchy loss')
    >>> plt.xlabel("t")
    >>> plt.ylabel("y")
    >>> plt.legend()
    >>> plt.show()
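
    A callable can also be passed as `loss`; as noted above, it must map
    ``z = f**2`` to an array of shape (3, m) holding the value and the first
    two derivatives of ``rho``. As a minimal sketch, here is the built-in
    'soft_l1' loss written out explicitly, which should reproduce
    ``res_soft_l1``:

    >>> def soft_l1_loss(z):
    ...     t = 1 + z
    ...     return np.vstack((2 * (t**0.5 - 1), t**-0.5, -0.5 * t**-1.5))
    ...
    >>> res_callable = least_squares(fun, x0, loss=soft_l1_loss, f_scale=0.1,
    ...                              args=(t_train, y_train))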

    In the next example, we show how complex-valued residual functions of
    complex variables can be optimized with ``least_squares()``. Consider the
    following function:

    >>> def f(z):
    ...     return z - (0.5 + 0.5j)

    We wrap it into a function of real variables that returns real residuals
    by simply handling the real and imaginary parts as independent variables:

    >>> def f_wrap(x):
    ...     fx = f(x[0] + 1j*x[1])
    ...     return np.array([fx.real, fx.imag])

    Thus, instead of the original m-D complex function of n complex
    variables we optimize a 2m-D real function of 2n real variables:

    >>> from scipy.optimize import least_squares
    >>> res_wrapped = least_squares(f_wrap, (0.1, 0.1), bounds=([0, 0], [1, 1]))
    >>> z = res_wrapped.x[0] + res_wrapped.x[1]*1j
    >>> z
    (0.49999999999925893+0.49999999999925893j)

    """
    if method not in ['trf', 'dogbox', 'lm']:
        raise ValueError("`method` must be 'trf', 'dogbox' or 'lm'.")

    if jac not in ['2-point', '3-point', 'cs'] and not callable(jac):
        raise ValueError("`jac` must be '2-point', '3-point', 'cs' or "
                         "callable.")

    if tr_solver not in [None, 'exact', 'lsmr']:
        raise ValueError("`tr_solver` must be None, 'exact' or 'lsmr'.")

    if loss not in IMPLEMENTED_LOSSES and not callable(loss):
        raise ValueError("`loss` must be one of {0} or a callable."
                         .format(IMPLEMENTED_LOSSES.keys()))

    if method == 'lm' and loss != 'linear':
        raise ValueError("method='lm' supports only 'linear' loss function.")

    if verbose not in [0, 1, 2]:
        raise ValueError("`verbose` must be in [0, 1, 2].")

    if len(bounds) != 2:
        raise ValueError("`bounds` must contain 2 elements.")

    if max_nfev is not None and max_nfev <= 0:
        raise ValueError("`max_nfev` must be None or positive integer.")

    if np.iscomplexobj(x0):
        raise ValueError("`x0` must be real.")

    x0 = np.atleast_1d(x0).astype(float)

    if x0.ndim > 1:
        raise ValueError("`x0` must have at most 1 dimension.")

    lb, ub = prepare_bounds(bounds, x0.shape[0])

    if method == 'lm' and not np.all((lb == -np.inf) & (ub == np.inf)):
        raise ValueError("Method 'lm' doesn't support bounds.")

    if lb.shape != x0.shape or ub.shape != x0.shape:
        raise ValueError("Inconsistent shapes between bounds and `x0`.")

    if np.any(lb >= ub):
        raise ValueError("Each lower bound must be strictly less than each "
                         "upper bound.")

    if not in_bounds(x0, lb, ub):
        raise ValueError("`x0` is infeasible.")

    x_scale = check_x_scale(x_scale, x0)

    ftol, xtol, gtol = check_tolerance(ftol, xtol, gtol, method)

    def fun_wrapped(x):
        return np.atleast_1d(fun(x, *args, **kwargs))

    if method == 'trf':
        # The 'trf' algorithm keeps iterates strictly feasible (see Notes),
        # so nudge x0 off the bounds if necessary.
        x0 = make_strictly_feasible(x0, lb, ub)

    f0 = fun_wrapped(x0)

    if f0.ndim != 1:
        raise ValueError("`fun` must return at most 1-d array_like. "
                         "f0.shape: {0}".format(f0.shape))

    if not np.all(np.isfinite(f0)):
        raise ValueError("Residuals are not finite in the initial point.")

    n = x0.size
    m = f0.size

    if method == 'lm' and m < n:
        raise ValueError("Method 'lm' doesn't work when the number of "
                         "residuals is less than the number of variables.")

    loss_function = construct_loss_function(m, loss, f_scale)
    if callable(loss):
        rho = loss_function(f0)
        if rho.shape != (3, m):
            raise ValueError("The return value of `loss` callable has wrong "
                             "shape.")
        initial_cost = 0.5 * np.sum(rho[0])
    elif loss_function is not None:
        initial_cost = loss_function(f0, cost_only=True)
    else:
        initial_cost = 0.5 * np.dot(f0, f0)

    if callable(jac):
        J0 = jac(x0, *args, **kwargs)

        if issparse(J0):
            J0 = J0.tocsr()

            def jac_wrapped(x, _=None):
                return jac(x, *args, **kwargs).tocsr()

        elif isinstance(J0, LinearOperator):
            def jac_wrapped(x, _=None):
                return jac(x, *args, **kwargs)

        else:
            J0 = np.atleast_2d(J0)

            def jac_wrapped(x, _=None):
                return np.atleast_2d(jac(x, *args, **kwargs))

    else:  # Estimate Jacobian by finite differences.
        if method == 'lm':
            if jac_sparsity is not None:
                raise ValueError("method='lm' does not support "
                                 "`jac_sparsity`.")

            if jac != '2-point':
                warn("jac='{0}' works equivalently to '2-point' "
                     "for method='lm'.".format(jac))

            J0 = jac_wrapped = None
        else:
            if jac_sparsity is not None and tr_solver == 'exact':
                raise ValueError("tr_solver='exact' is incompatible "
                                 "with `jac_sparsity`.")

            jac_sparsity = check_jac_sparsity(jac_sparsity, m, n)

            def jac_wrapped(x, f):
                J = approx_derivative(fun, x, rel_step=diff_step, method=jac,
                                      f0=f, bounds=bounds, args=args,
                                      kwargs=kwargs, sparsity=jac_sparsity)
                if J.ndim != 2:  # J is guaranteed not sparse.
                    J = np.atleast_2d(J)

                return J

            J0 = jac_wrapped(x0, f0)

    if J0 is not None:
        if J0.shape != (m, n):
            raise ValueError(
                "The return value of `jac` has wrong shape: expected {0}, "
                "actual {1}.".format((m, n), J0.shape))

        if not isinstance(J0, np.ndarray):
            if method == 'lm':
                raise ValueError("method='lm' works only with dense "
                                 "Jacobian matrices.")

            if tr_solver == 'exact':
                raise ValueError(
                    "tr_solver='exact' works only with dense "
                    "Jacobian matrices.")

        jac_scale = isinstance(x_scale, str) and x_scale == 'jac'
        if isinstance(J0, LinearOperator) and jac_scale:
            raise ValueError("x_scale='jac' can't be used when `jac` "
                             "returns LinearOperator.")

        if tr_solver is None:
            if isinstance(J0, np.ndarray):
                tr_solver = 'exact'
            else:
                tr_solver = 'lsmr'

    if method == 'lm':
        result = call_minpack(fun_wrapped, x0, jac_wrapped, ftol, xtol, gtol,
                              max_nfev, x_scale, diff_step)

    elif method == 'trf':
        result = trf(fun_wrapped, jac_wrapped, x0, f0, J0, lb, ub, ftol, xtol,
                     gtol, max_nfev, x_scale, loss_function, tr_solver,
                     tr_options.copy(), verbose)

    elif method == 'dogbox':
        if tr_solver == 'lsmr' and 'regularize' in tr_options:
            warn("The keyword 'regularize' in `tr_options` is not relevant "
                 "for 'dogbox' method.")
            tr_options = tr_options.copy()
            del tr_options['regularize']

        result = dogbox(fun_wrapped, jac_wrapped, x0, f0, J0, lb, ub, ftol,
                        xtol, gtol, max_nfev, x_scale, loss_function,
                        tr_solver, tr_options, verbose)

    result.message = TERMINATION_MESSAGES[result.status]
    result.success = result.status > 0

    if verbose >= 1:
        print(result.message)
        print("Function evaluations {0}, initial cost {1:.4e}, final cost "
              "{2:.4e}, first-order optimality {3:.2e}."
              .format(result.nfev, initial_cost, result.cost,
                      result.optimality))

    return result