Fix major issues in batch-based training mechanism

- Batch accumulation didn't take into account how many samples were
  processed by a batch worker. Extend the batch worker and gradient
  descent interfaces to cover this (see the sketch below)
- Fix the plus-rProp initializer so it creates a plus-rProp gradient
  instead of a plain rProp one
Alexey Edelev, 4 years ago
commit 74fce4149a
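
Why the sample count matters: a worker pre-sums the gradients of every
sample in its range, but the aggregating gradient previously did
batchSize++ once per worker result, so the global sum ended up scaled
by 1/workers instead of 1/samples. A self-contained sketch of that
scaling error (the worker and sample counts are made up):

    package main

    import "fmt"

    func main() {
    	workers, samplesPerWorker := 2, 100
    	oldBatchSize := workers                    // old: one increment per worker result
    	newBatchSize := workers * samplesPerWorker // new: workers report their sample count
    	fmt.Printf("old scale 1/%d, correct scale 1/%d\n", oldBatchSize, newBatchSize)
    	// Here the averaged gradient came out 100x too large.
    }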

+ 1 - 1
neuralnetwork/gradient.go

@@ -45,6 +45,6 @@ type OnlineGradientDescent interface {
 // Batch gradient descent interface. Is used by batch training mechanism
 type BatchGradientDescent interface {
 	ApplyDelta(m mat.Matrix) *mat.Dense
-	AccumGradients(gradient mat.Matrix)
+	AccumGradients(gradient mat.Matrix, batchSize int)
 	Gradients() *mat.Dense
 }
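
Because Go interfaces are satisfied implicitly, an implementation that
still carries the old one-argument AccumGradients simply stops
satisfying BatchGradientDescent, with no error at its definition site.
A compile-time assertion in package gradients (a sketch; both types
appear later in this commit) surfaces that immediately:

    // These declarations fail to build as soon as a signature drifts
    // from the BatchGradientDescent interface.
    var _ neuralnetwork.BatchGradientDescent = (*rPropGradient)(nil)
    var _ neuralnetwork.BatchGradientDescent = (*plusRPropGradient)(nil)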

+ 4 - 4
neuralnetwork/gradients/plusrpropgradient.go

@@ -45,9 +45,9 @@ type plusRPropGradient struct {
 func NewPlusRPropInitializer(config RPropConfig) neuralnetwork.GradientDescentInitializer {
 	return func(nn *neuralnetwork.NeuralNetwork, layer, gradientType int) interface{} {
 		if gradientType == neuralnetwork.BiasGradient {
-			return newRPropGradient(nn.Sizes[layer], 1, config)
+			return newPlusRPropGradient(nn.Sizes[layer], 1, config)
 		}
-		return newRPropGradient(nn.Sizes[layer], nn.Sizes[layer-1], config)
+		return newPlusRPropGradient(nn.Sizes[layer], nn.Sizes[layer-1], config)
 	}
 }
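
The bug fixed here compiled cleanly because both constructors return
values satisfying the same interface; only the update rule silently
differed. A regression test along these lines would have caught it (a
sketch in package gradients: the zero-value RPropConfig and the
NeuralNetwork literal are assumptions, BiasGradient is from the diff):

    func TestPlusRPropInitializerType(t *testing.T) {
    	nn := &neuralnetwork.NeuralNetwork{Sizes: []int{2, 3, 1}}
    	g := NewPlusRPropInitializer(RPropConfig{})(nn, 1, neuralnetwork.BiasGradient)
    	if _, ok := g.(*plusRPropGradient); !ok {
    		t.Fatalf("expected *plusRPropGradient, got %T", g)
    	}
    }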
 
@@ -110,12 +110,12 @@ func (g *plusRPropGradient) ApplyDelta(m mat.Matrix) (result *mat.Dense) {
 	return result
 }
 
-func (g *plusRPropGradient) AccumGradients(gradient mat.Matrix) {
+func (g *plusRPropGradient) AccumGradients(gradient mat.Matrix, batchSize int) {
 	g.gradients.Apply(func(i, j int, v float64) float64 {
 		v += gradient.At(i, j)
 		return v
 	}, g.gradients)
-	g.batchSize++
+	g.batchSize += batchSize
 }
 
 func (g plusRPropGradient) Gradients() *mat.Dense {

+ 2 - 2
neuralnetwork/gradients/rpropgradient.go

@@ -117,12 +117,12 @@ func (g *rPropGradient) ApplyDelta(m mat.Matrix) (result *mat.Dense) {
 	return result
 }
 
-func (g *rPropGradient) AccumGradients(gradient mat.Matrix) {
+func (g *rPropGradient) AccumGradients(gradient mat.Matrix, batchSize int) {
 	g.gradients.Apply(func(i, j int, v float64) float64 {
 		v += gradient.At(i, j)
 		return v
 	}, g.gradients)
-	g.batchSize++
+	g.batchSize += batchSize
 }
 
 func (g rPropGradient) Gradients() *mat.Dense {
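
Both rProp diffs truncate the surrounding method bodies; the consistent
reading is that Gradients() returns the raw accumulated sum, so one
worker's result can be fed back into the aggregating gradient's
AccumGradients, while batchSize carries the denominator that ApplyDelta
uses for averaging. A hypothetical helper showing that scaling step
(name and placement are assumptions):

    func (g rPropGradient) averagedGradient() *mat.Dense {
    	// Scale the accumulated sum by the true sample count; with the
    	// old per-call counting this denominator was far too small.
    	avg := mat.DenseCopyOf(g.gradients)
    	avg.Scale(1.0/float64(g.batchSize), avg)
    	return avg
    }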

+ 1 - 1
neuralnetwork/interface.go

@@ -78,7 +78,7 @@ func (f *SubscriptionFeatures) Clear() {
 
 type BatchWorker interface {
 	Run(trainer training.Trainer, startIndex, endIndex int)
-	Result(layer int) (dB, dW *mat.Dense)
+	Result(layer int) (dB, dW *mat.Dense, batchSize int)
 }
 
 type BatchWorkerFactory interface {
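
One property of the extended Result is worth spelling out (inferred
from localbatchworker.go below, so treat it as an assumption rather
than a documented contract): batchSize counts the samples consumed by
the preceding Run and does not depend on the layer argument, so callers
may read it from any layer:

    // Sketch: the count is per-worker, hence identical across layers
    // (assumes the network has at least three layers).
    func sampleCount(bw BatchWorker) int {
    	_, _, n1 := bw.Result(1)
    	_, _, n2 := bw.Result(2)
    	if n1 != n2 {
    		panic("BatchWorker reported layer-dependent batch sizes")
    	}
    	return n1
    }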

+ 6 - 4
neuralnetwork/localbatchworker.go

@@ -66,17 +66,19 @@ func newLocalBatchWorker(nn *NeuralNetwork) (bw *localBatchWorker) {
 }
 
 func (bw *localBatchWorker) Run(trainer training.Trainer, startIndex, endIndex int) {
+	bw.batchSize = 0
 	for i := startIndex; i < endIndex; i++ {
+		bw.batchSize++
 		dB, dW := bw.network.backward(trainer.GetData(i))
 		for l := 1; l < bw.network.LayerCount; l++ {
-			bw.BGradient[l].AccumGradients(dB[l])
-			bw.WGradient[l].AccumGradients(dW[l])
+			bw.BGradient[l].AccumGradients(dB[l], 1)
+			bw.WGradient[l].AccumGradients(dW[l], 1)
 		}
 	}
 }
 
-func (bw *localBatchWorker) Result(layer int) (dB, dW *mat.Dense) {
-	return bw.BGradient[layer].Gradients(), bw.WGradient[layer].Gradients()
+func (bw *localBatchWorker) Result(layer int) (dB, dW *mat.Dense, batchSize int) {
+	return bw.BGradient[layer].Gradients(), bw.WGradient[layer].Gradients(), bw.batchSize
 }
 
 func (lbwf localBatchWorkerFactory) GetBatchWorker() BatchWorker {
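
Resetting bw.batchSize at the top of Run makes a worker safe to reuse
across batches. A usage sketch (the factory, trainer, and index ranges
are placeholders):

    bw := factory.GetBatchWorker()
    bw.Run(trainer, 0, 100)   // processes 100 samples
    bw.Run(trainer, 100, 250) // count resets on entry, then reaches 150
    _, _, n := bw.Result(1)
    // n == 150: only the last Run is reported, with no double counting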

+ 3 - 3
neuralnetwork/neuralnetwork.go

@@ -420,9 +420,9 @@ func (nn *NeuralNetwork) trainBatch(trainer training.Trainer, epocs int) {
 				panic("wGradient is not a BatchGradientDescent")
 			}
 			for _, bw := range batchWorkers {
-				dB, dW := bw.Result(l)
-				bGradient.AccumGradients(dB)
-				wGradient.AccumGradients(dW)
+				dB, dW, size := bw.Result(l)
+				bGradient.AccumGradients(dB, size)
+				wGradient.AccumGradients(dW, size)
 			}
 			nn.Biases[l] = bGradient.ApplyDelta(nn.Biases[l])
 			nn.Weights[l] = wGradient.ApplyDelta(nn.Weights[l])
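
For context, a hedged sketch of how trainBatch might fan a batch out
across the workers before the aggregation loop above runs; only the
Run and Result signatures come from this commit, while the goroutine
orchestration, chunking, and variable names are assumptions (requires
"sync"):

    var wg sync.WaitGroup
    chunk := (batchEnd - batchStart + len(batchWorkers) - 1) / len(batchWorkers)
    for i, bw := range batchWorkers {
    	start := batchStart + i*chunk
    	end := start + chunk
    	if end > batchEnd {
    		end = batchEnd
    	}
    	wg.Add(1)
    	go func(bw BatchWorker, s, e int) {
    		defer wg.Done()
    		bw.Run(trainer, s, e) // each worker trains on its own slice
    	}(bw, start, end)
    }
    wg.Wait() // every worker's gradients are ready for the loop above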