/* * MIT License * * Copyright (c) 2019 Alexey Edelev , Tatyana Borisova * * This file is part of NeuralNetwork project https://git.semlanik.org/semlanik/NeuralNetwork * * Permission is hereby granted, free of charge, to any person obtaining a copy of this * software and associated documentation files (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, and * to permit persons to whom the Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be included in all copies * or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package neuralnetwork import ( "encoding/binary" "errors" "fmt" "io" "os" "runtime" "sync" "time" training "git.semlanik.org/semlanik/NeuralNetwork/training" mat "gonum.org/v1/gonum/mat" ) // NeuralNetwork is artificial neural network implementation // // Resources: // http://neuralnetworksanddeeplearning.com // https://www.youtube.com/watch?v=fNk_zzaMoSs // http://www.inf.fu-berlin.de/lehre/WS06/Musterererkennung/Paper/rprop.pdf // // Matrix: A (local matrices used in forward and backward methods) // Description: A is set of calculated neuron activations after sigmoid correction // Format: 0 l L // ⎡A[0] ⎤ ... ⎡A[0] ⎤ ... ⎡A[0] ⎤ // ⎢A[1] ⎥ ... ⎢A[1] ⎥ ... ⎢A[1] ⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎢A[i] ⎥ ... ⎢A[i] ⎥ ... ⎢A[i] ⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎣A[s] ⎦ ... ⎣A[s] ⎦ ... ⎣A[s] ⎦ // Where s = Sizes[l] - Neural network layer size // L = len(Sizes) - Number of neural network layers // // Matrix: Z (local matrices used in forward and backward methods) // Description: Z is set of calculated raw neuron activations // Format: 0 l L // ⎡Z[0] ⎤ ... ⎡Z[0] ⎤ ... ⎡Z[0] ⎤ // ⎢Z[1] ⎥ ... ⎢Z[1] ⎥ ... ⎢Z[1] ⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎢Z[i] ⎥ ... ⎢Z[i] ⎥ ... ⎢Z[i] ⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎣Z[s] ⎦ ... ⎣Z[s] ⎦ ... ⎣Z[s] ⎦ // Where s = Sizes[l] - Neural network layer size // L = len(Sizes) - Number of neural network layers // // Matrix: Biases // Description: Biases is set of biases per layer except l0 // NOTE: l0 is always empty Dense because first layer // doesn't have connections to previous layer // Format: 1 l L // ⎡b[0] ⎤ ... ⎡b[0] ⎤ ... ⎡b[0] ⎤ // ⎢b[1] ⎥ ... ⎢b[1] ⎥ ... ⎢b[1] ⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎢b[i] ⎥ ... ⎢b[i] ⎥ ... ⎢b[i] ⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎣b[s] ⎦ ... ⎣b[s] ⎦ ... ⎣b[s] ⎦ // Where s = Sizes[l] - Neural network layer size // L = len(Sizes) - Number of neural network layers // // Matrix: Weights // Description: Weights is set of weights per layer except l0 // NOTE: l0 is always empty Dense because first layer // doesn't have connections to previous layer // Format: 1 l L // ⎡w[0,0] ... w[0,j] ... w[0,s']⎤ ... ⎡w[0,0] ... w[0,j] ... w[0,s']⎤ ... ⎡w[0,0] ... w[0,j] ... w[0,s']⎤ // ⎢w[1,0] ... w[1,j] ... w[1,s']⎥ ... ⎢w[1,0] ... w[1,j] ... w[1,s']⎥ ... ⎢w[1,0] ... w[1,j] ... w[1,s']⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎢w[i,0] ... w[i,j] ... w[i,s']⎥ ... ⎢w[i,0] ... w[i,j] ... w[i,s']⎥ ... ⎢w[i,0] ... w[i,j] ... w[i,s']⎥ // ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥ // ⎣w[s,0] ... w[s,j] ... w[s,s']⎦ ... ⎣w[s,0] ... w[s,j] ... w[s,s']⎦ ... ⎣w[s,0] ... w[s,j] ... w[s,s']⎦ // Where s = Sizes[l] - Neural network layer size // s' = Sizes[l-1] - Previous neural network layer size // L = len(Sizes) - Number of neural network layers type NeuralNetwork struct { LayerCount int Sizes []int Biases []*mat.Dense Weights []*mat.Dense BGradient []interface{} WGradient []interface{} gradientDescentInitializer GradientDescentInitializer watcher StateWatcher syncMutex *sync.Mutex } // NewNeuralNetwork construction method that initializes new NeuralNetwork based // on provided list of layer sizes and GradientDescentInitializer that used for // backpropagation mechanism. // If gradientDescentInitializer is not provided (is nil) backpropagation won't // be possible. Common usecase when it's used is natural selection and genetic // training. func NewNeuralNetwork(sizes []int, gradientDescentInitializer GradientDescentInitializer) (nn *NeuralNetwork, err error) { err = nil if len(sizes) < 3 { fmt.Printf("Invalid network configuration: %v\n", sizes) return nil, errors.New("Invalid network configuration: %v\n") } for i := 0; i < len(sizes); i++ { if sizes[i] < 2 { fmt.Printf("Invalid network configuration: %v\n", sizes) return nil, errors.New("Invalid network configuration: %v\n") } } lenSizes := len(sizes) nn = &NeuralNetwork{ Sizes: sizes, LayerCount: len(sizes), Biases: make([]*mat.Dense, lenSizes), Weights: make([]*mat.Dense, lenSizes), BGradient: make([]interface{}, lenSizes), WGradient: make([]interface{}, lenSizes), gradientDescentInitializer: gradientDescentInitializer, syncMutex: &sync.Mutex{}, } for l := 1; l < nn.LayerCount; l++ { nn.Biases[l] = generateRandomDense(nn.Sizes[l], 1) nn.Weights[l] = generateRandomDense(nn.Sizes[l], nn.Sizes[l-1]) if nn.gradientDescentInitializer != nil { nn.BGradient[l] = nn.gradientDescentInitializer(nn, l, BiasGradient) nn.WGradient[l] = nn.gradientDescentInitializer(nn, l, WeightGradient) } } return } // Copy makes complete copy of NeuralNetwork data. Output network has the same // weights and biases values and but might be used independend of original one, // e.g. in separate goroutine func (nn *NeuralNetwork) Copy() (outNN *NeuralNetwork) { nn.syncMutex.Lock() defer nn.syncMutex.Unlock() outNN = &NeuralNetwork{ Sizes: nn.Sizes, LayerCount: len(nn.Sizes), Biases: make([]*mat.Dense, nn.LayerCount), Weights: make([]*mat.Dense, nn.LayerCount), BGradient: make([]interface{}, nn.LayerCount), WGradient: make([]interface{}, nn.LayerCount), gradientDescentInitializer: nn.gradientDescentInitializer, watcher: nn.watcher, syncMutex: &sync.Mutex{}, } for l := 1; l < outNN.LayerCount; l++ { outNN.Biases[l] = mat.DenseCopyOf(nn.Biases[l]) outNN.Weights[l] = mat.DenseCopyOf(nn.Weights[l]) if outNN.gradientDescentInitializer != nil { outNN.BGradient[l] = outNN.gradientDescentInitializer(outNN, l, BiasGradient) outNN.WGradient[l] = outNN.gradientDescentInitializer(outNN, l, WeightGradient) } } return } // Reset resets network state to intial/random one with specified in argument // layers configuration func (nn *NeuralNetwork) Reset(sizes []int) (err error) { nn.syncMutex.Lock() defer nn.syncMutex.Unlock() err = nil if len(sizes) < 3 { fmt.Printf("Invalid network configuration: %v\n", sizes) return errors.New("Invalid network configuration: %v\n") } for i := 0; i < len(sizes); i++ { if sizes[i] < 2 { fmt.Printf("Invalid network configuration: %v\n", sizes) return errors.New("Invalid network configuration: %v\n") } } lenSizes := len(sizes) nn.Sizes = sizes nn.LayerCount = len(sizes) nn.Biases = make([]*mat.Dense, lenSizes) nn.Weights = make([]*mat.Dense, lenSizes) nn.BGradient = make([]interface{}, lenSizes) nn.WGradient = make([]interface{}, lenSizes) for l := 1; l < nn.LayerCount; l++ { nn.Biases[l] = generateRandomDense(nn.Sizes[l], 1) nn.Weights[l] = generateRandomDense(nn.Sizes[l], nn.Sizes[l-1]) if nn.gradientDescentInitializer != nil { nn.BGradient[l] = nn.gradientDescentInitializer(nn, l, BiasGradient) nn.WGradient[l] = nn.gradientDescentInitializer(nn, l, WeightGradient) } } return } // SetStateWatcher setups state watcher for NeuralNetwork. StateWatcher is common // interface that collects data about NeuralNetwork behavior. If not specified (is // set to nil) NeuralNetwork will ignore StateWatcher interations func (nn *NeuralNetwork) SetStateWatcher(watcher StateWatcher) { nn.watcher = watcher if watcher != nil { watcher.Init(nn) if nn.watcher.GetSubscriptionFeatures().Has(StateSubscription) { watcher.UpdateState(StateIdle) } } } // Predict method invokes prediction based on input activations provided in argument. // Returns index of best element in output activation matrix and its value func (nn *NeuralNetwork) Predict(aIn mat.Matrix) (maxIndex int, max float64) { nn.syncMutex.Lock() defer nn.syncMutex.Unlock() if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(StateSubscription) { nn.watcher.UpdateState(StatePredict) defer nn.watcher.UpdateState(StateIdle) } } r, _ := aIn.Dims() if r != nn.Sizes[0] { fmt.Printf("Invalid rows number of input matrix size: %v\n", r) return -1, 0.0 } A, _ := nn.forward(aIn) result := A[nn.LayerCount-1] r, _ = result.Dims() max = 0.0 maxIndex = 0 for i := 0; i < r; i++ { if result.At(i, 0) > max { max = result.At(i, 0) maxIndex = i } } return } // Validate runs basic network validation/verification based on validation data that // provided by training.Trainer passed as argument. // Returns count of failure predictions and total amount of verified samples. func (nn *NeuralNetwork) Validate(trainer training.Trainer) (failCount, total int) { nn.syncMutex.Lock() defer nn.syncMutex.Unlock() failCount = 0 total = trainer.ValidatorCount() for i := 0; i < trainer.ValidatorCount(); i++ { dataSet, expect := trainer.GetValidator(i) index, _ := nn.Predict(dataSet) if expect.At(index, 0) != 1.0 { failCount++ } } if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(ValidationSubscription) { nn.watcher.UpdateValidation(total, failCount) } } return } // Train is common training function that invokes one of training methods depends on // gradient descent used buy NeuralNetwork. training.Trainer passed as argument used // to get training data. Training loops are limited buy number of epocs func (nn *NeuralNetwork) Train(trainer training.Trainer, epocs int) { if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(StateSubscription) { nn.watcher.UpdateState(StateLearning) defer nn.watcher.UpdateState(StateIdle) } } if _, ok := nn.WGradient[nn.LayerCount-1].(OnlineGradientDescent); ok { nn.trainOnline(trainer, epocs) } else if _, ok := nn.WGradient[nn.LayerCount-1].(BatchGradientDescent); ok { nn.trainBatch(trainer, epocs) } else { panic("Invalid gradient descent type") } } func (nn *NeuralNetwork) trainOnline(trainer training.Trainer, epocs int) { for t := 0; t < epocs; t++ { for i := 0; i < trainer.DataCount(); i++ { if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(TrainingSubscription) { nn.watcher.UpdateTraining(t, epocs, i, trainer.DataCount()) } } nn.syncMutex.Lock() dB, dW := nn.backward(trainer.GetData(i)) for l := 1; l < nn.LayerCount; l++ { bGradient, ok := nn.BGradient[l].(OnlineGradientDescent) if !ok { panic("bGradient is not a OnlineGradientDescent") } wGradient, ok := nn.WGradient[l].(OnlineGradientDescent) if !ok { panic("wGradient is not a OnlineGradientDescent") } nn.Biases[l] = bGradient.ApplyDelta(nn.Biases[l], dB[l]) nn.Weights[l] = wGradient.ApplyDelta(nn.Weights[l], dW[l]) if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(BiasesSubscription) { nn.watcher.UpdateBiases(l, mat.DenseCopyOf(nn.Biases[l])) } if nn.watcher.GetSubscriptionFeatures().Has(WeightsSubscription) { nn.watcher.UpdateWeights(l, mat.DenseCopyOf(nn.Weights[l])) } } } nn.syncMutex.Unlock() } } } func (nn *NeuralNetwork) trainBatch(trainer training.Trainer, epocs int) { fmt.Printf("Start training in %v threads\n", runtime.NumCPU()) for t := 0; t < epocs; t++ { if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(TrainingSubscription) { nn.watcher.UpdateTraining(t, epocs, 0, trainer.DataCount()) } } batchWorkers := nn.runBatchWorkers(runtime.NumCPU(), trainer) nn.syncMutex.Lock() for l := 1; l < nn.LayerCount; l++ { bGradient, ok := nn.BGradient[l].(BatchGradientDescent) if !ok { panic("bGradient is not a BatchGradientDescent") } wGradient, ok := nn.WGradient[l].(BatchGradientDescent) if !ok { panic("wGradient is not a BatchGradientDescent") } for _, bw := range batchWorkers { dB, dW := bw.result(l) bGradient.AccumGradients(dB) wGradient.AccumGradients(dW) } nn.Biases[l] = bGradient.ApplyDelta(nn.Biases[l]) nn.Weights[l] = wGradient.ApplyDelta(nn.Weights[l]) if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(BiasesSubscription) { nn.watcher.UpdateBiases(l, mat.DenseCopyOf(nn.Biases[l])) } if nn.watcher.GetSubscriptionFeatures().Has(WeightsSubscription) { nn.watcher.UpdateWeights(l, mat.DenseCopyOf(nn.Weights[l])) } } } nn.syncMutex.Unlock() //TODO: remove this is not used for visualization time.Sleep(100 * time.Millisecond) } } func (nn *NeuralNetwork) runBatchWorkers(threadCount int, trainer training.Trainer) (workers []*batchWorker) { wg := sync.WaitGroup{} chunkSize := trainer.DataCount() / threadCount workers = make([]*batchWorker, threadCount) for i, _ := range workers { workers[i] = newBatchWorker(nn) wg.Add(1) s := i go func() { workers[s].run(trainer, s*chunkSize, (s+1)*chunkSize) wg.Done() }() } wg.Wait() return } // SaveState saves state of NeuralNetwork to io.Writer. It's usefull to keep training results // between NeuralNetwork "power cycles" or to share traing results between clustered neural // network hosts func (nn *NeuralNetwork) SaveState(writer io.Writer) { nn.syncMutex.Lock() defer nn.syncMutex.Unlock() //save input array count bufferSize := make([]byte, 4) binary.LittleEndian.PutUint32(bufferSize[0:], uint32(nn.LayerCount)) _, err := writer.Write(bufferSize) check(err) //fmt.Printf("wrote value %d\n", uint32(nn.LayerCount)) // save an input array buffer := make([]byte, nn.LayerCount*4) for i := 0; i < nn.LayerCount; i++ { binary.LittleEndian.PutUint32(buffer[i*4:], uint32(nn.Sizes[i])) } _, err = writer.Write(buffer) check(err) // fmt.Printf("wrote buffer %d bytes\n", n2) //save biases for i := 1; i < nn.LayerCount; i++ { saveDense(writer, nn.Biases[i]) } //save weights for i := 1; i < nn.LayerCount; i++ { saveDense(writer, nn.Weights[i]) } } // SaveStateToFile saves NeuralNetwork state to file by specific filePath func (nn *NeuralNetwork) SaveStateToFile(filePath string) { outFile, err := os.OpenFile(filePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err) defer outFile.Close() nn.SaveState(outFile) } // LoadState loads NeuralNetwork state from io.Reader. All existing data in NeuralNetwork // will be rewritten buy this method, including layers configuration and weights and biases func (nn *NeuralNetwork) LoadState(reader io.Reader) { nn.syncMutex.Lock() defer nn.syncMutex.Unlock() // Reade count nn.LayerCount = readInt(reader) // Read an input array sizeBuffer := readByteArray(reader, nn.LayerCount*4) nn.Sizes = make([]int, nn.LayerCount) for l := 0; l < nn.LayerCount; l++ { nn.Sizes[l] = int(binary.LittleEndian.Uint32(sizeBuffer[l*4:])) fmt.Printf("LoadState: nn.Sizes[%d] %d \n", l, nn.Sizes[l]) } nn.Weights = []*mat.Dense{&mat.Dense{}} nn.Biases = []*mat.Dense{&mat.Dense{}} // read Biases nn.Biases[0] = &mat.Dense{} for l := 1; l < nn.LayerCount; l++ { nn.Biases = append(nn.Biases, &mat.Dense{}) nn.Biases[l] = readDense(reader, nn.Biases[l]) } // read Weights and initialize gradient descents nn.BGradient = make([]interface{}, nn.LayerCount) nn.WGradient = make([]interface{}, nn.LayerCount) nn.Weights[0] = &mat.Dense{} for l := 1; l < nn.LayerCount; l++ { nn.Weights = append(nn.Weights, &mat.Dense{}) nn.Weights[l] = readDense(reader, nn.Weights[l]) if nn.gradientDescentInitializer != nil { nn.BGradient[l] = nn.gradientDescentInitializer(nn, l, BiasGradient) nn.WGradient[l] = nn.gradientDescentInitializer(nn, l, WeightGradient) } } // fmt.Printf("\nLoadState end\n") } // LoadStateFromFile loads NeuralNetwork state from file by specific filePath func (nn *NeuralNetwork) LoadStateFromFile(filePath string) { inFile, err := os.Open(filePath) check(err) defer inFile.Close() nn.LoadState(inFile) } func (nn NeuralNetwork) forward(aIn mat.Matrix) (A, Z []*mat.Dense) { A = make([]*mat.Dense, nn.LayerCount) Z = make([]*mat.Dense, nn.LayerCount) A[0] = mat.DenseCopyOf(aIn) if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(ActivationsSubscription) { nn.watcher.UpdateActivations(0, mat.DenseCopyOf(A[0])) } } for l := 1; l < nn.LayerCount; l++ { A[l] = mat.NewDense(nn.Sizes[l], 1, nil) aSrc := A[l-1] aDst := A[l] // Each iteration implements formula bellow for neuron activation values // A[l]=σ(W[l]*A[l−1]+B[l]) // W[l]*A[l−1] aDst.Mul(nn.Weights[l], aSrc) // W[l]*A[l−1]+B[l] aDst.Add(aDst, nn.Biases[l]) // Save raw activation value for back propagation Z[l] = mat.DenseCopyOf(aDst) // σ(W[l]*A[l−1]+B[l]) aDst.Apply(applySigmoid, aDst) if nn.watcher != nil { if nn.watcher.GetSubscriptionFeatures().Has(ActivationsSubscription) { nn.watcher.UpdateActivations(l, mat.DenseCopyOf(aDst)) } } } return } // Function returns calculated bias and weights derivatives for each // layer arround aIn/aOut datasets func (nn NeuralNetwork) backward(aIn, aOut mat.Matrix) (dB, dW []*mat.Dense) { A, Z := nn.forward(aIn) lastLayerNum := nn.LayerCount - 1 dB = make([]*mat.Dense, nn.LayerCount) dW = make([]*mat.Dense, nn.LayerCount) // To calculate new values of weights and biases // following formulas are used: // ∂E/∂W[l] = A[l−1]*δ[l] // ∂E/∂B[l] = δ[l] // For last layer δ value is calculated by following: // δ = (A[L]−y)⊙σ'(Z[L]) // Calculate initial error for last layer L // error = A[L]-y // Where y is expected activations set err := &mat.Dense{} err.Sub(A[nn.LayerCount-1], aOut) // Calculate sigmoids prime σ'(Z[L]) for last layer L sigmoidsPrime := &mat.Dense{} sigmoidsPrime.Apply(applySigmoidPrime, Z[lastLayerNum]) // (A[L]−y)⊙σ'(Z[L]) delta := &mat.Dense{} delta.MulElem(err, sigmoidsPrime) // ∂E/∂B[L] = δ[L] biases := mat.DenseCopyOf(delta) // ∂E/∂W[L] = A[L−1]*δ[L] weights := &mat.Dense{} weights.Mul(delta, A[lastLayerNum-1].T()) // Initialize new weights and biases values with last layer values dB[lastLayerNum] = biases dW[lastLayerNum] = weights // Next layer derivatives of Weights and Biases are calculated using same formulas: // ∂E/∂W[l] = A[l−1]*δ[l] // ∂E/∂B[l] = δ[l] // But δ[l] is calculated using different formula: // δ[l] = ((Wt[l+1])*δ[l+1])⊙σ'(Z[l]) // Where Wt[l+1] is transposed matrix of actual Weights from // forward step for l := nn.LayerCount - 2; l > 0; l-- { // Calculate sigmoids prime σ'(Z[l]) for last layer l sigmoidsPrime := &mat.Dense{} sigmoidsPrime.Apply(applySigmoidPrime, Z[l]) // (Wt[l+1])*δ[l+1] // err bellow is delta from previous step(l+1) wdelta := &mat.Dense{} wdelta.Mul(nn.Weights[l+1].T(), delta) // Calculate new delta and store it to temporary variable err // δ[l] = ((Wt[l+1])*δ[l+1])⊙σ'(Z[l]) delta = &mat.Dense{} delta.MulElem(wdelta, sigmoidsPrime) // ∂E/∂B[l] = δ[l] biases := mat.DenseCopyOf(delta) // ∂E/∂W[l] = A[l−1]*δ[l] // At this point it's required to give explanation for inaccuracy // in the formula // Multiplying of activations matrix for layer l-1 and δ[l] is imposible // because view of matrices are following: // A[l-1] δ[l] // ⎡A[0] ⎤ ⎡δ[0] ⎤ // ⎢A[1] ⎥ ⎢δ[1] ⎥ // ⎢ ... ⎥ ⎢ ... ⎥ // ⎢A[i] ⎥ X ⎢δ[i] ⎥ // ⎢ ... ⎥ ⎢ ... ⎥ // ⎣A[s'] ⎦ ⎣δ[s] ⎦ // So we need to modify these matrices to apply mutiplications and got // Weights matrix of following view: // ⎡w[0,0] ... w[0,j] ... w[0,s']⎤ // ⎢w[1,0] ... w[1,j] ... w[1,s']⎥ // ⎢ ... ⎥ // ⎢w[i,0] ... w[i,j] ... w[i,s']⎥ // ⎢ ... ⎥ // ⎣w[s,0] ... w[s,j] ... w[s,s']⎦ // So we swap matrices and transpose A[l-1] to get valid multiplication // of following view: // δ[l] A[l-1] // ⎡δ[0] ⎤ x [A[0] A[1] ... A[i] ... A[s']] // ⎢δ[1] ⎥ // ⎢ ... ⎥ // ⎢δ[i] ⎥ // ⎢ ... ⎥ // ⎣δ[s] ⎦ weights := &mat.Dense{} weights.Mul(delta, A[l-1].T()) dB[l] = biases dW[l] = weights } return }