/*
 * MIT License
 *
 * Copyright (c) 2019 Alexey Edelev <semlanik@gmail.com>, Tatyana Borisova <tanusshhka@mail.ru>
 *
 * This file is part of NeuralNetwork project https://git.semlanik.org/semlanik/NeuralNetwork
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this
 * software and associated documentation files (the "Software"), to deal in the Software
 * without restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies
 * or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
 * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package neuralnetwork

import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"sync"
	"time"

	training "git.semlanik.org/semlanik/NeuralNetwork/training"
	mat "gonum.org/v1/gonum/mat"
)

// NeuralNetwork is an artificial neural network implementation
//
// Resources:
// http://neuralnetworksanddeeplearning.com
// https://www.youtube.com/watch?v=fNk_zzaMoSs
// http://www.inf.fu-berlin.de/lehre/WS06/Musterererkennung/Paper/rprop.pdf
//
// Matrix: A (local matrices used in forward and backward methods)
// Description: A is set of calculated neuron activations after sigmoid correction
// Format:    0          l           L
//         ⎡A[0] ⎤ ... ⎡A[0] ⎤ ... ⎡A[0] ⎤
//         ⎢A[1] ⎥ ... ⎢A[1] ⎥ ... ⎢A[1] ⎥
//         ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥
//         ⎢A[i] ⎥ ... ⎢A[i] ⎥ ... ⎢A[i] ⎥
//         ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥
//         ⎣A[s] ⎦ ... ⎣A[s] ⎦ ... ⎣A[s] ⎦
// Where s = Sizes[l] - Neural network layer size
//       L = len(Sizes) - Number of neural network layers
//
// Matrix: Z (local matrices used in forward and backward methods)
// Description: Z is set of calculated raw neuron activations
// Format:    0          l           L
//         ⎡Z[0] ⎤ ... ⎡Z[0] ⎤ ... ⎡Z[0] ⎤
//         ⎢Z[1] ⎥ ... ⎢Z[1] ⎥ ... ⎢Z[1] ⎥
//         ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥
//         ⎢Z[i] ⎥ ... ⎢Z[i] ⎥ ... ⎢Z[i] ⎥
//         ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥
//         ⎣Z[s] ⎦ ... ⎣Z[s] ⎦ ... ⎣Z[s] ⎦
// Where s = Sizes[l] - Neural network layer size
//       L = len(Sizes) - Number of neural network layers
//
// Matrix: Biases
// Description: Biases is the set of biases per layer except l0
//              NOTE: the l0 entry carries no data (nil after construction or
//              Reset, an empty Dense after LoadState) because the first layer
//              doesn't have connections to a previous layer
// Format:    1          l           L
//         ⎡b[0] ⎤ ... ⎡b[0] ⎤ ... ⎡b[0] ⎤
//         ⎢b[1] ⎥ ... ⎢b[1] ⎥ ... ⎢b[1] ⎥
//         ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥
//         ⎢b[i] ⎥ ... ⎢b[i] ⎥ ... ⎢b[i] ⎥
//         ⎢ ... ⎥ ... ⎢ ... ⎥ ... ⎢ ... ⎥
//         ⎣b[s] ⎦ ... ⎣b[s] ⎦ ... ⎣b[s] ⎦
// Where s = Sizes[l] - Neural network layer size
//       L = len(Sizes) - Number of neural network layers
//
// Matrix: Weights
// Description: Weights is the set of weights per layer except l0
//              NOTE: the l0 entry carries no data (nil after construction or
//              Reset, an empty Dense after LoadState) because the first layer
//              doesn't have connections to a previous layer
// Format:               1                                   l                                   L
//         ⎡w[0,0] ... w[0,j] ... w[0,s']⎤ ... ⎡w[0,0] ... w[0,j] ... w[0,s']⎤ ... ⎡w[0,0] ... w[0,j] ... w[0,s']⎤
//         ⎢w[1,0] ... w[1,j] ... w[1,s']⎥ ... ⎢w[1,0] ... w[1,j] ... w[1,s']⎥ ... ⎢w[1,0] ... w[1,j] ... w[1,s']⎥
//         ⎢              ...            ⎥ ... ⎢              ...            ⎥ ... ⎢              ...            ⎥
//         ⎢w[i,0] ... w[i,j] ... w[i,s']⎥ ... ⎢w[i,0] ... w[i,j] ... w[i,s']⎥ ... ⎢w[i,0] ... w[i,j] ... w[i,s']⎥
//         ⎢              ...            ⎥ ... ⎢              ...            ⎥ ... ⎢              ...            ⎥
//         ⎣w[s,0] ... w[s,j] ... w[s,s']⎦ ... ⎣w[s,0] ... w[s,j] ... w[s,s']⎦ ... ⎣w[s,0] ... w[s,j] ... w[s,s']⎦
// Where s = Sizes[l] - Neural network layer size
//       s' = Sizes[l-1] - Previous neural network layer size
//       L = len(Sizes) - Number of neural network layers

// NeuralNetwork holds the layer configuration, the trainable parameters and
// the training helpers of a feed-forward neural network.
type NeuralNetwork struct {
	// LayerCount is the number of layers; it is kept equal to len(Sizes).
	LayerCount                 int
	// Sizes holds the number of neurons of every layer, input layer first.
	Sizes                      []int
	// Biases holds one column matrix (Sizes[l] x 1) per layer; index 0 is unused.
	Biases                     []*mat.Dense
	// Weights holds one Sizes[l] x Sizes[l-1] matrix per layer; index 0 is unused.
	Weights                    []*mat.Dense
	// BGradient holds the per-layer bias gradient descent objects produced by
	// gradientDescentInitializer. Training type-asserts them to
	// OnlineGradientDescent or BatchGradientDescent.
	BGradient                  []interface{}
	// WGradient holds the per-layer weight gradient descent objects produced by
	// gradientDescentInitializer. Training type-asserts them to
	// OnlineGradientDescent or BatchGradientDescent.
	WGradient                  []interface{}
	// gradientDescentInitializer creates the BGradient/WGradient entries; when
	// it is nil those entries stay nil.
	gradientDescentInitializer GradientDescentInitializer
	// watcher optionally receives state, training, activation, bias and weight
	// updates; nil disables all notifications.
	watcher                    StateWatcher
	// syncMutex is locked by the methods that read or modify the network
	// parameters (Copy, Reset, Predict, Validate, training updates, SaveState,
	// LoadState).
	syncMutex                  *sync.Mutex
	// batchWorkerFactory supplies workers for batch training; a local factory
	// is installed on first use when it is nil.
	batchWorkerFactory         BatchWorkerFactory
	// earlyStop is tested after every training epoch to decide whether
	// training should be interrupted.
	earlyStop                  EarlyStop
}

// NewNeuralNetwork constructs a NeuralNetwork from the provided list of layer
// sizes and the GradientDescentInitializer used by the backpropagation
// mechanism.
//
// sizes must describe at least 3 layers and every layer must contain at least
// 2 neurons. Otherwise nil is returned together with an error that names the
// rejected configuration; the same message is also printed to stdout.
//
// If gradientDescentInitializer is nil no gradient descent objects are created
// and backpropagation won't be possible. A common use case for that is natural
// selection and genetic training.
//
// Biases and weights of every layer except the input layer are initialized
// with generateRandomDense.
func NewNeuralNetwork(sizes []int, gradientDescentInitializer GradientDescentInitializer) (nn *NeuralNetwork, err error) {
	valid := len(sizes) >= 3
	for i := 0; valid && i < len(sizes); i++ {
		valid = sizes[i] >= 2
	}
	if !valid {
		// Format the message once so that the printed text and the returned
		// error both contain the actual sizes instead of a raw "%v" verb.
		msg := fmt.Sprintf("Invalid network configuration: %v", sizes)
		fmt.Println(msg)
		return nil, errors.New(msg)
	}

	lenSizes := len(sizes)
	nn = &NeuralNetwork{
		Sizes:                      sizes,
		LayerCount:                 lenSizes,
		Biases:                     make([]*mat.Dense, lenSizes),
		Weights:                    make([]*mat.Dense, lenSizes),
		BGradient:                  make([]interface{}, lenSizes),
		WGradient:                  make([]interface{}, lenSizes),
		gradientDescentInitializer: gradientDescentInitializer,
		syncMutex:                  &sync.Mutex{},
		earlyStop:                  &noEarlyStop{},
	}

	// Layer 0 is the input layer: it has no incoming connections, so its
	// biases, weights and gradients are left unset.
	for l := 1; l < nn.LayerCount; l++ {
		nn.Biases[l] = generateRandomDense(nn.Sizes[l], 1)
		nn.Weights[l] = generateRandomDense(nn.Sizes[l], nn.Sizes[l-1])
		if nn.gradientDescentInitializer != nil {
			nn.BGradient[l] = nn.gradientDescentInitializer(nn, l, BiasGradient)
			nn.WGradient[l] = nn.gradientDescentInitializer(nn, l, WeightGradient)
		}
	}
	return nn, nil
}

// Copy creates a new NeuralNetwork with deep copies of the weights and biases
// of nn, so the result can be used independently of the original one, e.g. in
// a separate goroutine.
//
// The copy shares the Sizes slice, the gradient descent initializer and the
// watcher with nn, gets its own mutex and fresh gradient descent objects, and
// always starts with a noEarlyStop early stop. The batch worker factory is not
// carried over.
func (nn *NeuralNetwork) Copy() (outNN *NeuralNetwork) {
	nn.syncMutex.Lock()
	defer nn.syncMutex.Unlock()

	outNN = new(NeuralNetwork)
	outNN.Sizes = nn.Sizes
	outNN.LayerCount = len(nn.Sizes)
	outNN.Biases = make([]*mat.Dense, nn.LayerCount)
	outNN.Weights = make([]*mat.Dense, nn.LayerCount)
	outNN.BGradient = make([]interface{}, nn.LayerCount)
	outNN.WGradient = make([]interface{}, nn.LayerCount)
	outNN.gradientDescentInitializer = nn.gradientDescentInitializer
	outNN.watcher = nn.watcher
	outNN.syncMutex = &sync.Mutex{}
	outNN.earlyStop = &noEarlyStop{}

	// Layer 0 has no parameters, so copying starts from layer 1.
	for layer := 1; layer < outNN.LayerCount; layer++ {
		outNN.Biases[layer] = mat.DenseCopyOf(nn.Biases[layer])
		outNN.Weights[layer] = mat.DenseCopyOf(nn.Weights[layer])
		if outNN.gradientDescentInitializer == nil {
			continue
		}
		outNN.BGradient[layer] = outNN.gradientDescentInitializer(outNN, layer, BiasGradient)
		outNN.WGradient[layer] = outNN.gradientDescentInitializer(outNN, layer, WeightGradient)
	}
	return outNN
}

// SetBatchWorkerFactory sets the batch worker factory used for batch training.
// If no factory is set, a local one (NewLocalBatchWorkerFactory) is created
// the first time batch workers are run.
func (nn *NeuralNetwork) SetBatchWorkerFactory(factory BatchWorkerFactory) {
	nn.batchWorkerFactory = factory
}

// SetEarlyStop sets the early stop analyser that may stop training before all
// training epochs are finished. Early stop is usually required to avoid
// overfitting of the neural network. Training skips the early stop checks
// when earlyStop is nil.
func (nn *NeuralNetwork) SetEarlyStop(earlyStop EarlyStop) {
	nn.earlyStop = earlyStop
}

// Reset resets the network to an initial/random state using the layer
// configuration given in sizes.
//
// sizes must describe at least 3 layers and every layer must contain at least
// 2 neurons. Otherwise the network is left untouched and an error naming the
// rejected configuration is returned; the same message is also printed to
// stdout.
//
// On success biases and weights are regenerated with generateRandomDense and,
// when a gradient descent initializer is configured, the per-layer gradient
// descent objects are recreated.
func (nn *NeuralNetwork) Reset(sizes []int) (err error) {
	nn.syncMutex.Lock()
	defer nn.syncMutex.Unlock()

	valid := len(sizes) >= 3
	for i := 0; valid && i < len(sizes); i++ {
		valid = sizes[i] >= 2
	}
	if !valid {
		// Format the message once so that the printed text and the returned
		// error both contain the actual sizes instead of a raw "%v" verb.
		msg := fmt.Sprintf("Invalid network configuration: %v", sizes)
		fmt.Println(msg)
		return errors.New(msg)
	}

	lenSizes := len(sizes)
	nn.Sizes = sizes
	nn.LayerCount = lenSizes
	nn.Biases = make([]*mat.Dense, lenSizes)
	nn.Weights = make([]*mat.Dense, lenSizes)
	nn.BGradient = make([]interface{}, lenSizes)
	nn.WGradient = make([]interface{}, lenSizes)

	// Layer 0 is the input layer: it has no incoming connections, so its
	// biases, weights and gradients are left unset.
	for l := 1; l < nn.LayerCount; l++ {
		nn.Biases[l] = generateRandomDense(nn.Sizes[l], 1)
		nn.Weights[l] = generateRandomDense(nn.Sizes[l], nn.Sizes[l-1])
		if nn.gradientDescentInitializer != nil {
			nn.BGradient[l] = nn.gradientDescentInitializer(nn, l, BiasGradient)
			nn.WGradient[l] = nn.gradientDescentInitializer(nn, l, WeightGradient)
		}
	}

	return nil
}

// SetStateWatcher installs the state watcher of the NeuralNetwork.
// StateWatcher is a common interface that collects data about NeuralNetwork
// behavior. Passing nil removes the watcher, after which the NeuralNetwork
// skips all StateWatcher interactions.
//
// A non-nil watcher is initialized with the network and, if it subscribes to
// state updates, is immediately told that the network is idle.
func (nn *NeuralNetwork) SetStateWatcher(watcher StateWatcher) {
	nn.watcher = watcher
	if watcher == nil {
		return
	}

	watcher.Init(nn)
	if !nn.watcher.GetSubscriptionFeatures().Has(StateSubscription) {
		return
	}
	watcher.UpdateState(StateIdle)
}

// Predict runs a forward pass for the input activations aIn and returns the
// index of the largest value in the first column of the output layer together
// with that value.
//
// If the number of rows of aIn differs from the input layer size, a message
// is printed to stdout and (-1, 0.0) is returned. A watcher subscribed to
// state updates sees StatePredict for the duration of the call and StateIdle
// afterwards.
func (nn *NeuralNetwork) Predict(aIn mat.Matrix) (maxIndex int, max float64) {
	nn.syncMutex.Lock()
	defer nn.syncMutex.Unlock()

	if nn.watcher != nil && nn.watcher.GetSubscriptionFeatures().Has(StateSubscription) {
		nn.watcher.UpdateState(StatePredict)
		defer nn.watcher.UpdateState(StateIdle)
	}

	if inRows, _ := aIn.Dims(); inRows != nn.Sizes[0] {
		fmt.Printf("Invalid rows number of input matrix size: %v\n", inRows)
		return -1, 0.0
	}

	activations, _ := nn.forward(aIn)
	output := activations[nn.LayerCount-1]
	outRows, _ := output.Dims()

	// maxIndex and max start at their zero values, so only strictly positive
	// activations can replace the defaults.
	for row := 0; row < outRows; row++ {
		if value := output.At(row, 0); value > max {
			maxIndex, max = row, value
		}
	}
	return maxIndex, max
}

// Validate runs basic network validation/verification using the validation
// data provided by the training.Trainer passed as argument.
//
// Returns, in this order:
//   - squareError: the sum over all validation samples of the per-sample mean
//     squared difference between the output activations and the expected ones;
//   - failCount: the number of samples for which the expected output at the
//     index of the largest output activation is not exactly 1.0;
//   - total: the number of validation samples reported by the trainer.
//
// If a validation input has a row count different from the input layer size,
// a message is printed to stdout and (math.MaxFloat64, total, total) is
// returned. A watcher subscribed to validation updates receives total and
// failCount once all samples are processed.
func (nn *NeuralNetwork) Validate(trainer training.Trainer) (squareError float64, failCount, total int) {
	total = trainer.ValidatorCount()

	nn.syncMutex.Lock()
	defer nn.syncMutex.Unlock()

	if nn.watcher != nil && nn.watcher.GetSubscriptionFeatures().Has(StateSubscription) {
		nn.watcher.UpdateState(StateValidation)
		defer nn.watcher.UpdateState(StateIdle)
	}

	for sample := 0; sample < trainer.ValidatorCount(); sample++ {
		aIn, aOut := trainer.GetValidator(sample)
		if inRows, _ := aIn.Dims(); inRows != nn.Sizes[0] {
			fmt.Printf("Invalid rows number of input matrix size: %v\n", inRows)
			return math.MaxFloat64, total, total
		}

		activations, _ := nn.forward(aIn)
		output := activations[nn.LayerCount-1]
		outRows, _ := output.Dims()

		// Difference between the produced and the expected activations.
		diff := &mat.Dense{}
		diff.Sub(output, aOut)

		sumOfSquares := 0.0
		best, bestIndex := 0.0, 0
		for row := 0; row < outRows; row++ {
			if value := output.At(row, 0); value > best {
				best, bestIndex = value, row
			}
			d := diff.At(row, 0)
			sumOfSquares += d * d
		}

		if aOut.At(bestIndex, 0) != 1.0 {
			failCount++
		}
		squareError += sumOfSquares / float64(outRows)
	}

	if nn.watcher != nil && nn.watcher.GetSubscriptionFeatures().Has(ValidationSubscription) {
		nn.watcher.UpdateValidation(total, failCount)
	}
	return squareError, failCount, total
}

// Train is the common training entry point. It selects the training method
// from the type of the gradient descent object of the last layer: online
// training for OnlineGradientDescent, batch training for BatchGradientDescent.
// It panics with "Invalid gradient descent type" for anything else (including
// a network created without a gradient descent initializer).
//
// The training.Trainer passed as argument supplies the training data and the
// number of training loops is limited by epocs. A watcher subscribed to state
// updates sees StateLearning during training and StateIdle afterwards. When
// an early stop is set, it is reset and tested once before training starts.
func (nn *NeuralNetwork) Train(trainer training.Trainer, epocs int) {
	if nn.watcher != nil && nn.watcher.GetSubscriptionFeatures().Has(StateSubscription) {
		nn.watcher.UpdateState(StateLearning)
		defer nn.watcher.UpdateState(StateIdle)
	}

	if nn.earlyStop != nil {
		nn.earlyStop.Reset()
		nn.earlyStop.Test()
	}

	// Case order matters: a value satisfying both interfaces is trained online.
	switch nn.WGradient[nn.LayerCount-1].(type) {
	case OnlineGradientDescent:
		nn.trainOnline(trainer, epocs)
	case BatchGradientDescent:
		nn.trainBatch(trainer, epocs)
	default:
		panic("Invalid gradient descent type")
	}
}

// trainOnline performs online training: for every epoch and every training
// sample the derivatives returned by backward are applied to the biases and
// weights of each layer right away. The mutex is held while a single sample
// is processed. After each epoch the early stop (if any) is tested and a
// positive result ends the training.
//
// Panics if a layer's gradient descent object is not an OnlineGradientDescent.
func (nn *NeuralNetwork) trainOnline(trainer training.Trainer, epocs int) {
	for epoch := 0; epoch < epocs; epoch++ {
		for sample := 0; sample < trainer.DataCount(); sample++ {
			if nn.watcher != nil && nn.watcher.GetSubscriptionFeatures().Has(TrainingSubscription) {
				nn.watcher.UpdateTraining(epoch, epocs, sample, trainer.DataCount())
			}

			nn.syncMutex.Lock()
			dB, dW := nn.backward(trainer.GetData(sample))
			for layer := 1; layer < nn.LayerCount; layer++ {
				bGradient, isOnline := nn.BGradient[layer].(OnlineGradientDescent)
				if !isOnline {
					panic("bGradient is not a OnlineGradientDescent")
				}
				wGradient, isOnline := nn.WGradient[layer].(OnlineGradientDescent)
				if !isOnline {
					panic("wGradient is not a OnlineGradientDescent")
				}

				nn.Biases[layer] = bGradient.ApplyDelta(nn.Biases[layer], dB[layer])
				nn.Weights[layer] = wGradient.ApplyDelta(nn.Weights[layer], dW[layer])

				if nn.watcher == nil {
					continue
				}
				// Watchers receive copies so they can't alter the live parameters.
				if nn.watcher.GetSubscriptionFeatures().Has(BiasesSubscription) {
					nn.watcher.UpdateBiases(layer, mat.DenseCopyOf(nn.Biases[layer]))
				}
				if nn.watcher.GetSubscriptionFeatures().Has(WeightsSubscription) {
					nn.watcher.UpdateWeights(layer, mat.DenseCopyOf(nn.Weights[layer]))
				}
			}
			nn.syncMutex.Unlock()
		}

		if nn.earlyStop == nil {
			continue
		}
		if nn.earlyStop.Test() {
			log.Printf("Training stopped due to fail rate grow\n")
			break
		}
	}
}

// trainBatch performs batch training: in every epoch the batch workers
// compute derivatives for their part of the training data, the results of all
// workers are accumulated per layer and then applied to the biases and
// weights in one step. After each epoch the early stop (if any) is tested and
// a positive result ends the training.
//
// The watcher is optional; every access to it is guarded by a nil check so
// batch training also works when no watcher is installed.
//
// Panics if a layer's gradient descent object is not a BatchGradientDescent.
func (nn *NeuralNetwork) trainBatch(trainer training.Trainer, epocs int) {
	for t := 0; t < epocs; t++ {
		if nn.watcher != nil {
			if nn.watcher.GetSubscriptionFeatures().Has(TrainingSubscription) {
				nn.watcher.UpdateTraining(t, epocs, 0, trainer.DataCount())
			}
		}
		batchWorkers := nn.runBatchWorkers(trainer)
		nn.syncMutex.Lock()
		for l := 1; l < nn.LayerCount; l++ {
			bGradient, ok := nn.BGradient[l].(BatchGradientDescent)
			if !ok {
				panic("bGradient is not a BatchGradientDescent")
			}
			wGradient, ok := nn.WGradient[l].(BatchGradientDescent)
			if !ok {
				panic("wGradient is not a BatchGradientDescent")
			}
			// Collect the derivatives of every worker before applying them.
			for _, bw := range batchWorkers {
				dB, dW := bw.Result(l)
				bGradient.AccumGradients(dB)
				wGradient.AccumGradients(dW)
			}
			nn.Biases[l] = bGradient.ApplyDelta(nn.Biases[l])
			nn.Weights[l] = wGradient.ApplyDelta(nn.Weights[l])
			if nn.watcher != nil {
				// Watchers receive copies so they can't alter the live parameters.
				if nn.watcher.GetSubscriptionFeatures().Has(BiasesSubscription) {
					nn.watcher.UpdateBiases(l, mat.DenseCopyOf(nn.Biases[l]))
				}
				if nn.watcher.GetSubscriptionFeatures().Has(WeightsSubscription) {
					nn.watcher.UpdateWeights(l, mat.DenseCopyOf(nn.Weights[l]))
				}
			}
		}
		nn.syncMutex.Unlock()

		if nn.earlyStop != nil && nn.earlyStop.Test() {
			log.Printf("Training stopped due to fail rate grow\n")
			break
		}

		// Pause between epochs only when a watcher consumes bias or weight
		// updates. Without the nil check this call panics for networks that
		// have no watcher installed.
		if nn.watcher != nil {
			features := nn.watcher.GetSubscriptionFeatures()
			if features.Has(BiasesSubscription) || features.Has(WeightsSubscription) {
				time.Sleep(100 * time.Millisecond) //TODO: it's better to add 'Latency() int' method to watcher, for check above
			}
		}
	}
}

// runBatchWorkers splits the training data of trainer into one contiguous
// chunk per available thread, runs a batch worker on every chunk concurrently
// and returns the workers once all of them have finished.
//
// If no batch worker factory is set, a local one is created and stored.
// The data is divided evenly; the last worker additionally receives the
// remainder of the integer division so that no training sample is skipped.
// A non-positive thread count reported by the factory is treated as 1.
//
// NOTE(review): this assumes BatchWorker.Run treats its range as half-open
// [from, to), as the original s*chunkSize..(s+1)*chunkSize split suggests.
func (nn *NeuralNetwork) runBatchWorkers(trainer training.Trainer) (workers []BatchWorker) {
	if nn.batchWorkerFactory == nil {
		nn.batchWorkerFactory = NewLocalBatchWorkerFactory(nn)
		log.Printf("Batch Worker factory is not set, using local one\n")
	}

	threadCount := nn.batchWorkerFactory.GetAvailableThreads()
	if threadCount < 1 {
		// Avoid division by zero and an invalid slice length below.
		threadCount = 1
	}

	dataCount := trainer.DataCount()
	chunkSize := dataCount / threadCount
	workers = make([]BatchWorker, threadCount)

	var wg sync.WaitGroup
	for i := range workers {
		workers[i] = nn.batchWorkerFactory.GetBatchWorker()

		from, to := i*chunkSize, (i+1)*chunkSize
		if i == threadCount-1 {
			// The last chunk absorbs the dataCount % threadCount leftovers.
			to = dataCount
		}

		wg.Add(1)
		go func(worker BatchWorker) {
			defer wg.Done()
			worker.Run(trainer, from, to)
		}(workers[i])
	}
	wg.Wait()
	return workers
}

// SaveState writes the state of the NeuralNetwork to writer. It's useful for
// keeping training results between NeuralNetwork "power cycles" or for sharing
// training results between clustered neural network hosts.
//
// Layout of the written data:
//  1. layer count as little-endian uint32;
//  2. size of every layer as little-endian uint32;
//  3. biases of layers 1..LayerCount-1, written with saveDense;
//  4. weights of layers 1..LayerCount-1, written with saveDense.
//
// Write errors are passed to check.
func (nn *NeuralNetwork) SaveState(writer io.Writer) {
	nn.syncMutex.Lock()
	defer nn.syncMutex.Unlock()

	// Header: number of layers.
	var header [4]byte
	binary.LittleEndian.PutUint32(header[:], uint32(nn.LayerCount))
	_, err := writer.Write(header[:])
	check(err)

	// Layer sizes, 4 bytes each.
	encodedSizes := make([]byte, 4*nn.LayerCount)
	for layer := 0; layer < nn.LayerCount; layer++ {
		binary.LittleEndian.PutUint32(encodedSizes[4*layer:], uint32(nn.Sizes[layer]))
	}
	_, err = writer.Write(encodedSizes)
	check(err)

	// All biases are written before any weights; layer 0 has neither.
	for layer := 1; layer < nn.LayerCount; layer++ {
		saveDense(writer, nn.Biases[layer])
	}
	for layer := 1; layer < nn.LayerCount; layer++ {
		saveDense(writer, nn.Weights[layer])
	}
}

// SaveStateToFile writes the NeuralNetwork state to the file at filePath. The
// file is created if missing and truncated if it already exists. An error
// opening the file is passed to check.
func (nn *NeuralNetwork) SaveStateToFile(filePath string) {
	const openFlags = os.O_CREATE | os.O_TRUNC | os.O_WRONLY

	stateFile, err := os.OpenFile(filePath, openFlags, 0666)
	check(err)
	defer stateFile.Close()

	nn.SaveState(stateFile)
}

// LoadState reads the NeuralNetwork state from reader. All existing data in
// the NeuralNetwork is overwritten by this method, including the layer
// configuration, the weights and the biases.
//
// The data is consumed in the order produced by SaveState: layer count, layer
// sizes, biases of layers 1..LayerCount-1, weights of layers 1..LayerCount-1.
// Every loaded layer size is printed to stdout. When a gradient descent
// initializer is configured, the gradient descent objects of a layer are
// recreated right after its weights are read.
func (nn *NeuralNetwork) LoadState(reader io.Reader) {
	nn.syncMutex.Lock()
	defer nn.syncMutex.Unlock()

	// Layer count followed by the raw little-endian layer sizes.
	nn.LayerCount = readInt(reader)
	rawSizes := readByteArray(reader, nn.LayerCount*4)

	nn.Sizes = make([]int, nn.LayerCount)
	for layer := 0; layer < nn.LayerCount; layer++ {
		nn.Sizes[layer] = int(binary.LittleEndian.Uint32(rawSizes[layer*4:]))
		fmt.Printf("LoadState: nn.Sizes[%d] %d \n", layer, nn.Sizes[layer])
	}

	// Index 0 of both slices is an empty placeholder for the input layer.
	nn.Weights = []*mat.Dense{&mat.Dense{}}
	nn.Biases = []*mat.Dense{&mat.Dense{}}

	// Biases come first in the stream.
	nn.Biases[0] = &mat.Dense{}
	for layer := 1; layer < nn.LayerCount; layer++ {
		target := &mat.Dense{}
		nn.Biases = append(nn.Biases, target)
		nn.Biases[layer] = readDense(reader, target)
	}

	// Weights follow; gradient descents are initialized layer by layer.
	nn.BGradient = make([]interface{}, nn.LayerCount)
	nn.WGradient = make([]interface{}, nn.LayerCount)
	nn.Weights[0] = &mat.Dense{}
	for layer := 1; layer < nn.LayerCount; layer++ {
		target := &mat.Dense{}
		nn.Weights = append(nn.Weights, target)
		nn.Weights[layer] = readDense(reader, target)

		if nn.gradientDescentInitializer == nil {
			continue
		}
		nn.BGradient[layer] = nn.gradientDescentInitializer(nn, layer, BiasGradient)
		nn.WGradient[layer] = nn.gradientDescentInitializer(nn, layer, WeightGradient)
	}
}

// LoadStateFromFile reads the NeuralNetwork state from the file at filePath.
// An error opening the file is passed to check.
func (nn *NeuralNetwork) LoadStateFromFile(filePath string) {
	stateFile, err := os.Open(filePath)
	check(err)
	defer stateFile.Close()

	nn.LoadState(stateFile)
}

// forward propagates the input activations aIn through the network.
//
// It returns, per layer, the activations A and the raw (pre-sigmoid) values Z.
// A[0] is a copy of aIn and Z[0] is left nil. For every following layer:
//
//	Z[l] = W[l]*A[l−1] + B[l]
//	A[l] = σ(Z[l])
//
// A watcher subscribed to activation updates receives a copy of every A[l].
func (nn NeuralNetwork) forward(aIn mat.Matrix) (A, Z []*mat.Dense) {
	A = make([]*mat.Dense, nn.LayerCount)
	Z = make([]*mat.Dense, nn.LayerCount)

	A[0] = mat.DenseCopyOf(aIn)
	if nn.watcher != nil && nn.watcher.GetSubscriptionFeatures().Has(ActivationsSubscription) {
		nn.watcher.UpdateActivations(0, mat.DenseCopyOf(A[0]))
	}

	for layer := 1; layer < nn.LayerCount; layer++ {
		// Raw activation: W[l]*A[l−1] + B[l]. It is kept for back propagation.
		raw := mat.NewDense(nn.Sizes[layer], 1, nil)
		raw.Mul(nn.Weights[layer], A[layer-1])
		raw.Add(raw, nn.Biases[layer])
		Z[layer] = raw

		// Activation: σ applied element-wise to the raw values.
		activated := mat.NewDense(nn.Sizes[layer], 1, nil)
		activated.Apply(applySigmoid, raw)
		A[layer] = activated

		if nn.watcher != nil && nn.watcher.GetSubscriptionFeatures().Has(ActivationsSubscription) {
			nn.watcher.UpdateActivations(layer, mat.DenseCopyOf(activated))
		}
	}
	return A, Z
}

// backward runs a forward pass for aIn and returns the bias and weight
// derivatives (dB, dW) of every layer for the aIn/aOut data set. Index 0 of
// both results is left nil because the input layer has no parameters.
//
// Formulas used:
//
//	δ[L] = (A[L]−y)⊙σ'(Z[L])              for the last layer L
//	δ[l] = ((Wt[l+1])*δ[l+1])⊙σ'(Z[l])    for the layers before it
//	∂E/∂B[l] = δ[l]
//	∂E/∂W[l] = δ[l]*At[l−1]
//
// Where y is the expected activations set (aOut), Wt[l+1] is the transposed
// weights matrix used in the forward step and At[l−1] is the transposed
// activations column of the previous layer.
func (nn NeuralNetwork) backward(aIn, aOut mat.Matrix) (dB, dW []*mat.Dense) {
	A, Z := nn.forward(aIn)

	last := nn.LayerCount - 1
	dB = make([]*mat.Dense, nn.LayerCount)
	dW = make([]*mat.Dense, nn.LayerCount)

	// Output error: A[L]−y
	outputError := &mat.Dense{}
	outputError.Sub(A[last], aOut)

	// σ'(Z[L])
	outputPrime := &mat.Dense{}
	outputPrime.Apply(applySigmoidPrime, Z[last])

	// δ[L] = (A[L]−y)⊙σ'(Z[L])
	delta := &mat.Dense{}
	delta.MulElem(outputError, outputPrime)

	// ∂E/∂B[L] = δ[L]
	dB[last] = mat.DenseCopyOf(delta)

	// ∂E/∂W[L] = δ[L]*At[L−1]
	dW[last] = &mat.Dense{}
	dW[last].Mul(delta, A[last-1].T())

	// Walk the remaining layers from the one before last down to layer 1.
	// At the top of every iteration delta still holds δ[l+1].
	for layer := last - 1; layer > 0; layer-- {
		// σ'(Z[l])
		prime := &mat.Dense{}
		prime.Apply(applySigmoidPrime, Z[layer])

		// (Wt[l+1])*δ[l+1]
		propagated := &mat.Dense{}
		propagated.Mul(nn.Weights[layer+1].T(), delta)

		// δ[l] = ((Wt[l+1])*δ[l+1])⊙σ'(Z[l])
		layerDelta := &mat.Dense{}
		layerDelta.MulElem(propagated, prime)
		delta = layerDelta

		// ∂E/∂B[l] = δ[l]
		dB[layer] = mat.DenseCopyOf(delta)

		// ∂E/∂W[l]: A[l−1] and δ[l] are both column matrices
		//          A[l-1]       δ[l]
		//         ⎡A[0]  ⎤     ⎡δ[0] ⎤
		//         ⎢ ...  ⎥     ⎢ ... ⎥
		//         ⎣A[s'] ⎦     ⎣δ[s] ⎦
		// so they can't be multiplied as they are. Multiplying δ[l] by the
		// transposed A[l−1] instead
		//           δ[l]               A[l-1]
		//         ⎡δ[0] ⎤ x [A[0] A[1] ... A[i] ... A[s']]
		//         ⎢ ... ⎥
		//         ⎣δ[s] ⎦
		// produces a matrix with the same layout as Weights[l]:
		//         ⎡w[0,0] ... w[0,j] ... w[0,s']⎤
		//         ⎢              ...            ⎥
		//         ⎣w[s,0] ... w[s,j] ... w[s,s']⎦
		dW[layer] = &mat.Dense{}
		dW[layer].Mul(delta, A[layer-1].T())
	}
	return dB, dW
}