Source code for paramz.optimization.scg

#===============================================================================
# Copyright (c) 1996-2014, I. Nabney, N.Lawrence and James Hensman (1996 - 2014)
# Copyright (c) 2015, Max Zwiessele
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
# 
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# 
# * Neither the name of paramz nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#===============================================================================

'''
Scaled Conjugate Gradients, originally in Matlab as part of the
Netlab toolbox by I. Nabney, converted to Python by N. Lawrence
and given a pythonic interface by James Hensman.

Edited by Max Zwiessele for efficiency and verbose optimization.
'''

from __future__ import print_function
import numpy as np
import sys

def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, xtol=None, ftol=None, gtol=None):
    """
    Optimisation through Scaled Conjugate Gradients (SCG)

    f: the objective function
    gradf : the gradient function (should return a 1D np.ndarray)
    x : the initial condition

    Returns
    x the optimal value for x
    flog : a list of all the objective values
    function_eval number of fn evaluations
    status: string describing convergence status
    """
    if xtol is None:
        xtol = 1e-6
    if ftol is None:
        ftol = 1e-6
    if gtol is None:
        gtol = 1e-5

    sigma0 = 1.0e-7
    fold = f(x, *optargs)  # Initial function value.
    function_eval = 1
    fnow = fold
    gradnew = gradf(x, *optargs)  # Initial gradient.
    function_eval += 1
    #if any(np.isnan(gradnew)):
    #    raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
    current_grad = np.dot(gradnew, gradnew)
    gradold = gradnew.copy()
    d = -gradnew  # Initial search direction.
    success = True  # Force calculation of directional derivs.
    nsuccess = 0  # nsuccess counts number of successes.
    beta = 1.0  # Initial scale parameter.
    betamin = 1.0e-15  # Lower bound on scale.
    betamax = 1.0e15  # Upper bound on scale.
    status = "Not converged"

    flog = [fold]

    iteration = 0

    # Main optimization loop.
    while iteration < maxiters:

        # Calculate first and second directional derivatives.
        if success:
            mu = np.dot(d, gradnew)
            if mu >= 0:  # pragma: no cover
                d = -gradnew
                mu = np.dot(d, gradnew)
            kappa = np.dot(d, d)
            sigma = sigma0 / np.sqrt(kappa)
            xplus = x + sigma * d
            gplus = gradf(xplus, *optargs)
            function_eval += 1
            theta = np.dot(d, (gplus - gradnew)) / sigma

        # Increase effective curvature and evaluate step size alpha.
        delta = theta + beta * kappa
        if delta <= 0:  # pragma: no cover
            delta = beta * kappa
            beta = beta - theta / kappa

        alpha = -mu / delta

        # Calculate the comparison ratio.
        xnew = x + alpha * d
        fnew = f(xnew, *optargs)
        function_eval += 1

        Delta = 2. * (fnew - fold) / (alpha * mu)
        if Delta >= 0.:
            success = True
            nsuccess += 1
            x = xnew
            fnow = fnew
        else:
            success = False
            fnow = fold

        # Store relevant variables
        flog.append(fnow)  # Current function value

        iteration += 1

        if success:
            # Test for termination
            if (np.abs(fnew - fold) < ftol):
                status = 'converged - relative reduction in objective'
                break  # return x, flog, function_eval, status
            elif (np.max(np.abs(alpha * d)) < xtol):
                status = 'converged - relative stepsize'
                break
            else:
                # Update variables for new position
                gradold = gradnew
                gradnew = gradf(x, *optargs)
                function_eval += 1
                current_grad = np.dot(gradnew, gradnew)
                fold = fnew
                # If the gradient is zero then we are done.
                if current_grad <= gtol:
                    status = 'converged - relative reduction in gradient'
                    break  # return x, flog, function_eval, status

        # Adjust beta according to comparison ratio.
        if Delta < 0.25:
            beta = min(4.0 * beta, betamax)
        if Delta > 0.75:
            beta = max(0.25 * beta, betamin)

        # Update search direction using Polak-Ribiere formula, or re-start
        # in direction of negative gradient after nparams steps.
        if nsuccess == x.size:
            d = -gradnew
            beta = 1.  # This is not in the original paper
            nsuccess = 0
        elif success:
            Gamma = np.dot(gradold - gradnew, gradnew) / (mu)
            d = Gamma * d - gradnew
    else:
        # If we get here, then we haven't terminated in the given number of
        # iterations.
        status = "maxiter exceeded"

    return x, flog, function_eval, status
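
A minimal usage sketch (not part of the original module): it exercises the call
signature documented above on a simple quadratic objective, assuming only numpy
and the SCG function defined in this file. The objective, gradient and starting
point are illustrative choices, not from the source.

if __name__ == '__main__':
    # Illustrative quadratic: 0.5 * x^T A x - b^T x, minimised at x = A^{-1} b.
    A = np.diag([1., 10., 100.])
    b = np.array([1., -2., 3.])

    def objective(x):
        return 0.5 * np.dot(x, np.dot(A, x)) - np.dot(b, x)

    def gradient(x):
        # SCG expects the gradient as a 1D np.ndarray.
        return np.dot(A, x) - b

    x0 = np.zeros(3)
    xopt, flog, n_evals, status = SCG(objective, gradient, x0, maxiters=200)
    print(status)
    print(xopt)      # should be close to np.linalg.solve(A, b)
    print(n_evals)   # number of function/gradient evaluations used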