# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""line search"""
from typing import NamedTuple
from ... import nn
from ... import numpy as mnp
from ...common import dtype as mstype
from ...common import Tensor
from ..utils import _to_scalar, _to_tensor, grad
class _LineSearchResults(NamedTuple):
"""Results of line search results.
Args:
failed (bool): `True`` if the strong Wolfe criteria were satisfied
nit (int): number of iterations
nfev (int): number of functions evaluations
ngev (int): number of gradients evaluations
k (int): number of iterations
a_k (float): step size
f_k (float): final function value
g_k (Tensor): final gradient value
status (int): end status
"""
failed: bool
nit: int
nfev: int
ngev: int
k: int
a_k: float
f_k: float
g_k: Tensor
status: int
def _cubicmin(a, fa, fpa, b, fb, c, fc):
"""Finds the minimizer for a cubic polynomial that goes through the
points (a,fa), (b,fb), and (c,fc) with derivative at a of fpa.
"""
db = b - a
dc = c - a
denom = (db * dc) ** 2 * (db - dc)
d1 = mnp.zeros((2, 2))
d1[0, 0] = dc ** 2
d1[0, 1] = -db ** 2
d1[1, 0] = -dc ** 3
d1[1, 1] = db ** 3
d2 = mnp.zeros((2,))
d2[0] = fb - fa - fpa * db
d2[1] = fc - fa - fpa * dc
a2, b2 = mnp.dot(d1, d2) / denom
radical = b2 * b2 - 3. * a2 * fpa
xmin = a + (-b2 + mnp.sqrt(radical)) / (3. * a2)
return xmin
def _quadmin(a, fa, fpa, b, fb):
"""Finds the minimizer for a quadratic polynomial that goes through
the points (a,fa), (b,fb) with derivative at a of fpa.
"""
db = b - a
b2 = (fb - fa - fpa * db) / (db ** 2)
xmin = a - fpa / (2. * b2)
return xmin
def _zoom(fn, a_low, phi_low, dphi_low, a_high, phi_high, dphi_high, phi_0, g_0, dphi_0, c1, c2, is_run):
"""Implementation of zoom algorithm.
Algorithm 3.6 from Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-61.
Tries cubic, quadratic, and bisection methods of zooming.
"""
# Constant tensors which avoid loop unrolling
const_float_one = _to_tensor(1., dtype=a_low.dtype)
const_bool_false = _to_tensor(False)
const_int_zero = _to_tensor(0)
state = {
"done": const_bool_false,
"failed": const_bool_false,
"j": const_int_zero,
"a_low": a_low,
"phi_low": phi_low,
"dphi_low": dphi_low,
"a_high": a_high,
"phi_high": phi_high,
"dphi_high": dphi_high,
"a_rec": (a_low + a_high) / 2.,
"phi_rec": (phi_low + phi_high) / 2.,
"a_star": const_float_one,
"phi_star": phi_low,
"dphi_star": dphi_low,
"g_star": g_0,
"nfev": const_int_zero,
"ngev": const_int_zero,
}
if mnp.logical_not(is_run):
return state
delta1 = 0.2
delta2 = 0.1
maxiter = 10  # maximum zoom iterations (scipy uses 10, jax uses 30)
while mnp.logical_not(state["done"]) and state["j"] < maxiter:
dalpha = state["a_high"] - state["a_low"]
a = mnp.minimum(state["a_low"], state["a_high"])
b = mnp.maximum(state["a_low"], state["a_high"])
cchk = delta1 * dalpha
qchk = delta2 * dalpha
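# Safeguards: the cubic fit is only tried after the first zoom iteration, and an interpolated
# point is accepted only if it lies at least cchk (cubic) or qchk (quadratic) inside the
# current interval; otherwise fall back to the next method, and finally to bisection.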
a_j_cubic = _cubicmin(state["a_low"], state["phi_low"], state["dphi_low"], state["a_high"],
state["phi_high"], state["a_rec"], state["phi_rec"])
use_cubic = state["j"] > 0 and mnp.isfinite(a_j_cubic) and \
mnp.logical_and(a_j_cubic > a + cchk, a_j_cubic < b - cchk)
a_j_quad = _quadmin(state["a_low"], state["phi_low"], state["dphi_low"], state["a_high"],
state["phi_high"])
use_quad = mnp.logical_not(use_cubic) and mnp.isfinite(a_j_quad) and \
mnp.logical_and(a_j_quad > a + qchk, a_j_quad < b - qchk)
a_j_bisection = (state["a_low"] + state["a_high"]) / 2.0
use_bisection = mnp.logical_not(use_cubic) and mnp.logical_not(use_quad)
a_j = mnp.where(use_cubic, a_j_cubic, state["a_rec"])
a_j = mnp.where(use_quad, a_j_quad, a_j)
a_j = mnp.where(use_bisection, a_j_bisection, a_j)
phi_j, g_j, dphi_j = fn(a_j)
state["nfev"] += 1
state["ngev"] += 1
j_to_high = (phi_j > phi_0 + c1 * a_j * dphi_0) or (phi_j >= state["phi_low"])
state["a_rec"] = mnp.where(j_to_high, state["a_high"], state["a_rec"])
state["phi_rec"] = mnp.where(j_to_high, state["phi_high"], state["phi_rec"])
state["a_high"] = mnp.where(j_to_high, a_j, state["a_high"])
state["phi_high"] = mnp.where(j_to_high, phi_j, state["phi_high"])
state["dphi_high"] = mnp.where(j_to_high, dphi_j, state["dphi_high"])
j_to_star = mnp.logical_not(j_to_high) and mnp.abs(dphi_j) <= -c2 * dphi_0
state["done"] = j_to_star
state["a_star"] = mnp.where(j_to_star, a_j, state["a_star"])
state["phi_star"] = mnp.where(j_to_star, phi_j, state["phi_star"])
state["g_star"] = mnp.where(j_to_star, g_j, state["g_star"])
state["dphi_star"] = mnp.where(j_to_star, dphi_j, state["dphi_star"])
low_to_high = mnp.logical_not(j_to_high) and mnp.logical_not(j_to_star) and \
dphi_j * (state["a_high"] - state["a_low"]) >= 0.
state["a_rec"] = mnp.where(low_to_high, state["a_high"], state["a_rec"])
state["phi_rec"] = mnp.where(low_to_high, state["phi_high"], state["phi_rec"])
state["a_high"] = mnp.where(low_to_high, a_low, state["a_high"])
state["phi_high"] = mnp.where(low_to_high, phi_low, state["phi_high"])
state["dphi_high"] = mnp.where(low_to_high, dphi_low, state["dphi_high"])
j_to_low = mnp.logical_not(j_to_high) and mnp.logical_not(j_to_star)
state["a_rec"] = mnp.where(j_to_low, state["a_low"], state["a_rec"])
state["phi_rec"] = mnp.where(j_to_low, state["phi_low"], state["phi_rec"])
state["a_low"] = mnp.where(j_to_low, a_j, state["a_low"])
state["phi_low"] = mnp.where(j_to_low, phi_j, state["phi_low"])
state["dphi_low"] = mnp.where(j_to_low, dphi_j, state["dphi_low"])
state["j"] += 1
state["failed"] = state["j"] == maxiter
return state
class LineSearch(nn.Cell):
"""Line Search that satisfies strong Wolfe conditions."""
def __init__(self, func, jac):
"""Initialize LineSearch."""
super(LineSearch, self).__init__()
self.func = func
self.jac = jac
def construct(self, xk, pk, old_fval=None, old_old_fval=None, gfk=None, c1=1e-4, c2=0.9, maxiter=20):
def fval_and_grad(alpha):
xkk = xk + alpha * pk
fkk = self.func(xkk)
gkk = self.jac(xkk)
return fkk, gkk, mnp.dot(gkk, pk)
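# fval_and_grad evaluates the one-dimensional restriction phi(alpha) = f(xk + alpha * pk),
# its full gradient, and the directional derivative dphi(alpha) = grad f(xk + alpha * pk) . pk.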
# Constant tensors which avoid loop unrolling
const_float_zero = _to_tensor(0., dtype=xk.dtype)
const_float_one = _to_tensor(1., dtype=xk.dtype)
const_bool_false = _to_tensor(False)
const_int_zero = _to_tensor(0)
const_int_one = _to_tensor(1)
if old_fval is None or gfk is None:
nfev, ngev = const_int_one, const_int_one
phi_0, g_0, dphi_0 = fval_and_grad(const_float_zero)
else:
nfev, ngev = const_int_zero, const_int_zero
phi_0, g_0 = old_fval, gfk
dphi_0 = mnp.dot(g_0, pk)
if old_old_fval is None:
start_value = const_float_one
else:
old_phi0 = old_old_fval
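# Initial-step heuristic (as in SciPy's Wolfe line search): choose the step whose first-order
# predicted decrease, alpha * dphi_0, matches twice the previous decrease phi_0 - old_phi0,
# scaled by 1.01 and capped at 1.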
candidate_start_value = 1.01 * 2 * (phi_0 - old_phi0) / dphi_0
start_value = mnp.where(
mnp.isfinite(candidate_start_value),
mnp.minimum(candidate_start_value, const_float_one),
const_float_one
)
state = {
"done": const_bool_false,
"failed": const_bool_false,
"i": const_int_one,
"a_i": const_float_zero,
"phi_i": phi_0,
"dphi_i": dphi_0,
"nfev": nfev,
"ngev": ngev,
"a_star": const_float_zero,
"phi_star": phi_0,
"dphi_star": dphi_0,
"g_star": g_0,
}
while mnp.logical_not(state["done"]) and state["i"] <= maxiter:
a_i = mnp.where(state["i"] > 1, state["a_i"] * 2.0, start_value)
phi_i, g_i, dphi_i = fval_and_grad(a_i)
state["nfev"] += 1
state["ngev"] += 1
# Sufficient decrease (Armijo) condition violated, or no improvement over the previous
# trial: zoom between a_{i-1} and a_i
cond1 = (phi_i > phi_0 + c1 * a_i * dphi_0) or \
(phi_i >= state["phi_i"] and state["i"] > 1)
zoom1 = _zoom(fval_and_grad, state["a_i"], state["phi_i"], state["dphi_i"],
a_i, phi_i, dphi_i, phi_0, g_0, dphi_0, c1, c2, cond1)
state["nfev"] += zoom1["nfev"]
state["ngev"] += zoom1["ngev"]
state["done"] = cond1
state["failed"] = cond1 and zoom1["failed"]
state["a_star"] = mnp.where(cond1, zoom1["a_star"], state["a_star"])
state["phi_star"] = mnp.where(cond1, zoom1["phi_star"], state["phi_star"])
state["g_star"] = mnp.where(cond1, zoom1["g_star"], state["g_star"])
state["dphi_star"] = mnp.where(cond1, zoom1["dphi_star"], state["dphi_star"])
# Curvature condition also holds: a_i satisfies the strong Wolfe conditions
cond2 = mnp.logical_not(cond1) and mnp.abs(dphi_i) <= -c2 * dphi_0
state["done"] = state["done"] or cond2
state["a_star"] = mnp.where(cond2, a_i, state["a_star"])
state["phi_star"] = mnp.where(cond2, phi_i, state["phi_star"])
state["g_star"] = mnp.where(cond2, g_i, state["g_star"])
state["dphi_star"] = mnp.where(cond2, dphi_i, state["dphi_star"])
# Derivative at a_i is non-negative: a minimizer has been bracketed, so zoom over the
# reversed interval (a_i, a_{i-1})
cond3 = mnp.logical_not(cond1) and mnp.logical_not(cond2) and dphi_i >= 0.
zoom2 = _zoom(fval_and_grad, a_i, phi_i, dphi_i, state["a_i"], state["phi_i"],
state["dphi_i"], phi_0, g_0, dphi_0, c1, c2, cond3)
state["nfev"] += zoom2["nfev"]
state["ngev"] += zoom2["ngev"]
state["done"] = state["done"] or cond3
state["failed"] = state["failed"] or (cond3 and zoom2["failed"])
state["a_star"] = mnp.where(cond3, zoom2["a_star"], state["a_star"])
state["phi_star"] = mnp.where(cond3, zoom2["phi_star"], state["phi_star"])
state["g_star"] = mnp.where(cond3, zoom2["g_star"], state["g_star"])
state["dphi_star"] = mnp.where(cond3, zoom2["dphi_star"], state["dphi_star"])
state["i"] += 1
state["a_i"] = a_i
state["phi_i"] = phi_i
state["dphi_i"] = dphi_i
state["status"] = mnp.where(
state["failed"],
1, # zoom failed
mnp.where(
state["i"] > maxiter,
3, # maxiter reached
0, # passed (should be)
),
)
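# For dtypes narrower than float64, a final step with magnitude below 1e-8 is nudged to
# sign(a_star) * 1e-8, presumably to avoid returning a step that underflows to an
# effectively null update (an exact zero stays zero since mnp.sign(0) == 0).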
state["a_star"] = mnp.where(
_to_tensor(state["a_star"].dtype != mstype.float64)
and (mnp.abs(state["a_star"]) < 1e-8),
mnp.sign(state["a_star"]) * 1e-8,
state["a_star"],
)
return state
def line_search(f, xk, pk, jac=None, gfk=None, old_fval=None, old_old_fval=None, c1=1e-4, c2=0.9, maxiter=20):
"""Inexact line search that satisfies strong Wolfe conditions.
Algorithm 3.5 from Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-61
Note:
`line_search` is not supported on Windows platform yet.
Args:
f (function): function of the form f(x) where x is a flat Tensor and returns a real
scalar. The function should be composed of operations with vjp defined.
xk (Tensor): initial guess.
pk (Tensor): direction to search in. Assumes the direction is a descent direction.
jac (function): gradient function of `f`, of the form jac(x) where x is a flat Tensor
and returns a Tensor. If None, the gradient is obtained by automatic differentiation.
Default: None.
gfk (Tensor): initial gradient value at `xk`. Default: None.
old_fval (Tensor): initial function value at `xk`. Default: None.
old_old_fval (Tensor): function value at the point preceding `xk`; if given, it is used
to choose the initial step size. Default: None.
c1 (float): Wolfe criteria constant for the sufficient decrease condition, see ref.
Default: 1e-4.
c2 (float): Wolfe criteria constant for the curvature condition, see ref. Default: 0.9.
maxiter (int): maximum number of iterations to search. Default: 20.
Returns:
LineSearchResults, results of the line search.
Supported Platforms:
``CPU`` ``GPU``
Examples:
>>> import numpy as onp
>>> from mindspore.scipy.optimize import line_search
>>> from mindspore.common import Tensor
>>> x0 = Tensor(onp.ones(2).astype(onp.float32))
>>> p0 = Tensor(onp.array([-1, -1]).astype(onp.float32))
>>> def func(x):
...     return x[0] ** 2 - x[1] ** 3
>>> res = line_search(func, x0, p0)
>>> print(res.a_k)
1.0
"""
if jac is None:
jac = grad(f)
state = LineSearch(f, jac)(xk, pk, old_fval, old_old_fval, gfk, c1, c2, maxiter)
# If running in graph mode, the state is a tuple.
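# The tuple follows the insertion order of the state dict built in construct:
# (done, failed, i, a_i, phi_i, dphi_i, nfev, ngev, a_star, phi_star, dphi_star, g_star, status).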
if isinstance(state, tuple):
state = _LineSearchResults(failed=_to_scalar(state[1] or not state[0]),
nit=_to_scalar(state[2] - 1),
nfev=_to_scalar(state[6]),
ngev=_to_scalar(state[7]),
k=_to_scalar(state[2]),
a_k=_to_scalar(state[8]),
f_k=_to_scalar(state[9]),
g_k=state[11],
status=_to_scalar(state[12]))
else:
state = _LineSearchResults(failed=_to_scalar(state["failed"] or not state["done"]),
nit=_to_scalar(state["i"] - 1),
nfev=_to_scalar(state["nfev"]),
ngev=_to_scalar(state["ngev"]),
k=_to_scalar(state["i"]),
a_k=_to_scalar(state["a_star"]),
f_k=_to_scalar(state["phi_star"]),
g_k=state["g_star"],
status=_to_scalar(state["status"]))
return state