
What is wrong with this multilayer perceptron backpropagation implementation?

  • j4nw  ·  asked 6 years ago

    I have been trying to translate the Wikipedia article on the algorithm directly, and I have looked at plenty of other resources, but I have not been able to get backpropagation to work. I believe this is a matter of misusing or misunderstanding some numpy operations rather than a problem with the algorithm itself, but maybe not.

    Here is a complete, runnable program:

    import numpy as np
    
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])    
    y = np.array([0, 1, 1, 0])
    
    def activation(x): # sigmoid
        return 1 / (1 + np.exp(-x))
    
    def activation_d(x): # sigmoid derivative
        s = activation(x)
        return s * (1 - s)
    
    def cost(y1, y2):
        return (np.linalg.norm(y1 - y2) ** 2) / 2
    
    def mlp_train(X, y, n_h, learning_rate=1e-2, max_iterations=10000):
        n_i = 1 if len(X.shape) == 1 else len(X[0]) # input neurons count
        n_o = 1 if len(y.shape) == 1 else len(y[0]) # output neurons count
        h_layer = [np.random.rand(n_i) for i in range(n_h)]
        o_layer = [np.random.rand(n_h) for i in range(n_o)]
    
        for iteration in range(max_iterations):
            if (iteration % 2000 == 0): print('iteration', iteration)
            for j in range(len(X)):
                x = X[j]
                h = [activation(np.dot(x, n)) for n in h_layer]
                o = [activation(np.dot(h, n)) for n in o_layer]
                o = np.array(o)
                c = cost(o, np.array(y[j]))
                a_d = activation_d(x)
                o_grad = c * a_d
                o_delta = learning_rate * o_grad * h
                o_layer += o_delta
                h_grad = a_d * np.dot(o_delta, o_layer.T)
                h_delta = learning_rate * h_grad * x
                h_layer += h_delta
                if (iteration % 2000 == 0): print(x, '->', o, 'cost', c)
    
    mlp_train(X, y, n_h=2)
    

    The cost is not being minimized at all, and all of the outputs converge to 1:

    iteration 0
    [0 0] -> [0.70755] cost 0.25031025599858575
    [0 1] -> [0.74962] cost 0.031344966778714914
    [1 0] -> [0.7546] cost 0.030109871312169207
    [1 1] -> [0.78708] cost 0.30974627646512554
    iteration 2000
    [0 0] -> [0.9568] cost 0.45773097730807827
    [0 1] -> [0.97965] cost 0.00020711262741391742
    [1 0] -> [0.98117] cost 0.00017728427582410072
    [1 1] -> [0.99024] cost 0.4902891867523237
    iteration 4000
    [0 0] -> [0.99691] cost 0.49691698274069973
    [0 1] -> [0.99932] cost 2.28713104303751e-07
    [1 0] -> [0.99941] cost 1.7196664273121246e-07
    [1 1] -> [0.99984] cost 0.4998383598602833
    iteration 6000
    [0 0] -> [0.9998] cost 0.49980132025306195
    [0 1] -> [0.99998] cost 1.587647769449268e-10
    [1 0] -> [0.99999] cost 1.0506374041454625e-10
    [1 1] -> [1.] cost 0.49999803556631833
    iteration 8000
    [0 0] -> [0.99999] cost 0.49998771962960453
    [0 1] -> [1.] cost 6.768208468242089e-14
    [1 0] -> [1.] cost 3.953230074403547e-14
    [1 1] -> [1.] cost 0.49999998306930477
    
    1 Answer  |  6 years ago
  •   BugKiller  ·  answered 6 years ago

    This is a re-implementation rather than a fix of your code, with the optimization step rewritten, because it seems you have not grasped the key idea of backpropagation; see Neural Networks and Deep Learning. The per-sample update rules the code relies on are sketched below, followed by the full program.
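
    In outline, the code below follows the standard per-sample backpropagation relations for this two-layer network (a sketch for reference: σ is the sigmoid, C = ½‖o − y‖² is the quadratic cost, η is the learning rate, and samples are row vectors):

    \delta_o = (o - y) \odot \sigma'(z_o), \qquad
    \frac{\partial C}{\partial W_o} = h^{\top} \delta_o, \qquad
    \frac{\partial C}{\partial b_o} = \delta_o

    \delta_h = \bigl( \delta_o W_o^{\top} \bigr) \odot \sigma'(z_h), \qquad
    \frac{\partial C}{\partial W_h} = x^{\top} \delta_h, \qquad
    \frac{\partial C}{\partial b_h} = \delta_h

    W \leftarrow W - \eta \, \frac{\partial C}{\partial W}, \qquad
    b \leftarrow b - \eta \, \frac{\partial C}{\partial b}

    Here z_h = x W_h + b_h, h = σ(z_h), z_o = h W_o + b_o, o = σ(z_o), and σ'(z) = σ(z)(1 − σ(z)), which is what cost_d and the (ho * (1 - ho)) factor compute. Note that the code updates ow before computing grad_h, so the hidden-layer gradient reuses the freshly updated output weights.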

    import numpy as np
    
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    y = np.array([0, 1, 1, 0])
    
    
    def activation(x):  # sigmoid
        return 1 / (1 + np.exp(-x))
    
    
    def activation_d(x):  # sigmoid derivative
        s = activation(x)
        return s * (1 - s)
    
    
    def cost(y1, y2):
        return (np.linalg.norm(y1 - y2) ** 2) / 2
    
    def cost_d(y1, y2):
        """ Gradient of the quadratic cost w.r.t. the output pre-activation

        Args:
            y1: prediction (sigmoid output)
            y2: target

        Returns:
            grads: same shape as y1 / y2
        """
        # chain rule: dC/d(oo) = (y1 - y2), d(oo)/d(oz) = sigmoid'(oz) = y1 * (1 - y1)
        return np.array(y1 - y2) * (y1 * (1 - y1))
    
    
    def mlp_train(X, y, n_h, learning_rate=1e-2, max_iterations=10000):
        n_i = 1 if len(X.shape) == 1 else len(X[0])  # input neurons count
        n_o = 1 if len(y.shape) == 1 else len(y[0])  # output neurons count
        # add bias
        hw = np.random.randn(n_i, n_h)
        hb = np.zeros((1, n_h))
        ow = np.random.randn(n_h, n_o)
        ob = np.zeros((1, n_o))
        for iteration in range(max_iterations):
            if iteration % 2000 == 0:
                print('iteration', iteration)
            for x_i, y_i in zip(X, y):
                # forwardprop
                x_i = x_i[np.newaxis, :]
                hz = np.dot(x_i, hw) + hb   # (1, n_h)
                ho = activation(hz)
    
                oz = np.dot(ho, ow) + ob    # (1, n_o)
                oo = activation(oz)
    
                # cost
                c = cost(oo, y_i)
    
                # backwardprop
                grad_oz = cost_d(oo, y_i)   # (1, n_o)
                grad_ob = grad_oz
                grad_ow = np.dot(ho.T, grad_oz) # (n_h, n_o)
                # update
                ow -= learning_rate * grad_ow
                ob -= learning_rate * grad_ob
    
                grad_h = np.dot(grad_oz, ow.T)  # (1, n_h)
                grad_hz = grad_h * (ho * (1 - ho))
                grad_hb = grad_hz       # (1, n_h)
                grad_hw = np.dot(x_i.T, grad_hz)    # (n_i, n_h)
                # update
                hw -= learning_rate * grad_hw
                hb -= learning_rate * grad_hb
    
                if iteration % 2000 == 0:
                    print(x_i, '->', oo, 'cost', c)
    
    
    mlp_train(X, y, n_h=2, max_iterations=int(1e5))
    

    Output:

    ...
    [[1 0]] -> [[0.94364581]] cost 0.0015878973434031022
    [[1 1]] -> [[0.04349045]] cost 0.0009457095065652823
    iteration 96000
    [[0 0]] -> [[0.04870092]] cost 0.0011858898326805463
    [[0 1]] -> [[0.95518092]] cost 0.0010043748998508786
    [[1 0]] -> [[0.94458789]] cost 0.001535251186790804
    [[1 1]] -> [[0.04277648]] cost 0.0009149137866793687
    iteration 98000
    [[0 0]] -> [[0.04791496]] cost 0.0011479218121198723
    [[0 1]] -> [[0.95588406]] cost 0.0009731082050768009
    [[1 0]] -> [[0.94548701]] cost 0.0014858330062528543
    [[1 1]] -> [[0.04209458]] cost 0.0008859767334115659
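
    A quick way to check gradients like grad_ow and grad_hw above is a finite-difference comparison. The snippet below is only an illustrative sketch (the shapes, seed, and the single sample x/t are made up for the example); it compares the analytic gradient for one output weight against a central-difference estimate of the same quadratic cost:

    import numpy as np

    np.random.seed(0)
    x = np.array([[1.0, 0.0]])            # one sample, shape (1, n_i)
    t = np.array([[1.0]])                 # its target, shape (1, n_o)

    hw = np.random.randn(2, 2); hb = np.zeros((1, 2))   # hidden weights / bias
    ow = np.random.randn(2, 1); ob = np.zeros((1, 1))   # output weights / bias

    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    def forward_cost(ow_candidate):
        """Forward pass with a candidate output weight matrix; returns (cost, ho, oo)."""
        ho = sigmoid(np.dot(x, hw) + hb)
        oo = sigmoid(np.dot(ho, ow_candidate) + ob)
        return 0.5 * np.sum((oo - t) ** 2), ho, oo

    # analytic gradient, same chain rule as grad_oz / grad_ow in the answer
    c, ho, oo = forward_cost(ow)
    grad_oz = (oo - t) * oo * (1 - oo)
    grad_ow = np.dot(ho.T, grad_oz)

    # central-difference estimate for the single entry ow[0, 0]
    eps = 1e-6
    ow_plus, ow_minus = ow.copy(), ow.copy()
    ow_plus[0, 0] += eps
    ow_minus[0, 0] -= eps
    numeric = (forward_cost(ow_plus)[0] - forward_cost(ow_minus)[0]) / (2 * eps)

    print('analytic:', grad_ow[0, 0])
    print('numeric: ', numeric)           # the two should agree to roughly 1e-8

    The same comparison can be repeated for grad_hw by perturbing entries of hw instead of ow.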