SeedRandom[1234];
muGBM = 0;
sigmaGBM = 0.1;
X0 = 10;
generateAllSamplesForOnePath[pathValues_List, pathTimes_List] :=
Module[{n =
Length[pathValues]}, {{pathValues[[#[[1]]]],
pathTimes[[#[[2]]]] - pathTimes[[#[[1]]]]},
pathValues[[#[[2]]]]} & /@
Flatten[Table[{i, j}, {i, 1, n}, {j, i + 1, n}], 1]];
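(* for illustration (toy path): the function enumerates every pair of sample points
   i < j along a path and returns {{value at t_i, t_j - t_i}, value at t_j}, e.g.
   generateAllSamplesForOnePath[{1., 2., 3.}, {0., 0.5, 1.}] gives
   {{{1., 0.5}, 2.}, {{1., 1.}, 3.}, {{2., 0.5}, 3.}} *)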
samples =
Flatten[generateAllSamplesForOnePath[#["Values"], #["Times"]] & /@
Table[RandomFunction[
GeometricBrownianMotionProcess[muGBM, sigmaGBM, X0], {0,
10, .01}], {i, 1, 10}], 1];
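(* each path sampled on {0, 10, .01} has 1001 points, i.e. 1001*1000/2 = 500500
   pairs per path, so about 5*10^6 samples over the 10 paths *)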
splitRatio = 0.8;
(*Randomly shuffle the dataset*)
shuffledData = RandomSample[samples];
(*Split inputs and targets*)
(*numTrainingSamples=Round[splitRatio*Length[shuffledData]];*)
numTrainingSamples = Length[samples] - 100000;
inputs =
  shuffledData[[1 ;; numTrainingSamples, 1]]; (*2D inputs: {start value, time increment}*)
targets =
  shuffledData[[1 ;; numTrainingSamples, 2]]; (*targets: scalar end values*)
validInputs = shuffledData[[numTrainingSamples + 1 ;;, 1]];
validTargets = shuffledData[[numTrainingSamples + 1 ;;, 2]];
trainMDNADV[inputDim_, numComponents_, nEpoch_] :=
  Module[{basicNN, mdnNet, lossLayer, mdnNetWithLoss},
(*Define the number of mixture components*)
(*numComponents=5;*)
(*ndim=2;*)
basicNN = NetGraph[{
LinearLayer[50],(*1*)
Ramp,
LinearLayer[50],
Ramp,
LinearLayer[50],(*5*)
     ThreadingLayer[Plus],(*6*)
LinearLayer[3 numComponents]}, {NetPort["Input"] ->
1 -> 2 -> 3 -> 4 -> 6, NetPort["Input"] -> 5 -> 6, 6 -> 7},
"Input" -> 2];
(*basicNN=NetGraph[{
LinearLayer[50],(*1*)
Ramp,
LinearLayer[50],
Ramp,
LinearLayer[3 numComponents]},{NetPort["Input"]->1->2->3->4->5},
"Input"->2];*)
mdnNet = NetGraph[{
basicNN,
PartLayer[1 ;; numComponents],
SoftmaxLayer[],
PartLayer[numComponents + 1 ;; 2 numComponents],
PartLayer[2 numComponents + 1 ;; 3 numComponents],
ElementwiseLayer[Exp] }, {1 -> 2 -> 3 -> NetPort["Alpha"],
1 -> 4 -> NetPort["Mu"], 1 -> 5 -> 6 -> NetPort["Sigma"]},
"Input" -> inputDim];
lossLayer =
FunctionLayer[
Function[
Block[{}, -Log[
Total[#Alpha* (1/((6.283185307179586`)^(1/2)*#Sigma) *
Exp[-(( #Output - #Mu )^2/(2 (#Sigma) ^2))])]]]]];
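   (* lossLayer is the negative log-likelihood of the Gaussian mixture:
      -Log[Total[alpha_k * PDF[NormalDistribution[mu_k, sigma_k], y]]] *)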
mdnNetWithLoss =
NetGraph[{mdnNet,
lossLayer}, {{NetPort[{1, "Alpha"}], NetPort[{1, "Sigma"}],
NetPort["Output"], NetPort[{1, "Mu"}]} -> 2 -> NetPort["Loss"]}];
NetTrain[mdnNetWithLoss, <|"Input" -> inputs, "Output" -> targets|>,
LossFunction -> "Loss",
ValidationSet -> <|"Input" -> validInputs,
"Output" -> validTargets|>,
BatchSize -> 1000, MaxTrainingRounds -> nEpoch,
LearningRate -> 0.00001,
Method -> {"ADAM", "GradientClipping" -> 10}]];
trainedNet1 = trainMDNADV[2, 5, 10];
The above code fails at batch 2 with the following error:
NetTrain::arrdiv: Training was stopped early because one or more trainable parameters of the net diverged. The net with the lowest validation loss will be returned. To avoid divergence, ensure that the training data has been normalized to have zero mean and unit variance. You can also try specifying a lower learning rate, or use a different optimization method; the (possibly automatic) values used were Method->{ADAM,Beta1->0.9,Beta2->0.999,Epsilon->1/100000,GradientClipping->10.,L2Regularization->None,LearningRate->0.00001,LearningRateSchedule->None,WeightClipping->None}, LearningRate->0.00001. Alternatively, you can use the "GradientClipping" option to Method to bound the magnitude of gradients during training.
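For reference, the zero-mean / unit-variance normalization that the message recommends would look roughly like the following sketch (the norm* names are ad hoc; each input column and the targets are standardized with the training-set statistics, and the same statistics are reused for the validation split):

(* sketch only: standardize inputs and targets as the NetTrain message suggests *)
inputMeans = Mean[inputs]; inputSDs = StandardDeviation[inputs];
normInputs = (# - inputMeans)/inputSDs & /@ inputs;
normValidInputs = (# - inputMeans)/inputSDs & /@ validInputs;
targetMean = Mean[targets]; targetSD = StandardDeviation[targets];
normTargets = (targets - targetMean)/targetSD;
normValidTargets = (validTargets - targetMean)/targetSD;

(trainMDNADV reads the global inputs/targets/validInputs/validTargets, so they would have to be reassigned to these normalized versions before calling it.)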
I wrote a similar neural network in PyTorch and it works fine. I suspected the problem might be due to the ThreadingLayer, because the error disappears after removing it. I also inspected the weights of the trained net with:
trainedMDNNet1 = NetExtract[trainedNet1, {1, 1}];
NetExtract[trainedMDNNet1, {1, "Weights"}]
trainedMDNNet1[inputs[[3]]]
and found that the parameters are all 0, which seems very strange.
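For completeness, the mixture parameters and the loss value produced by the returned net can also be inspected directly (a sketch; the layer index 1 assumes the mdnNetWithLoss structure above, since NetTrain returns the net with the loss layer attached):

mdnOnly = NetExtract[trainedNet1, 1];  (* the MDN without the loss layer *)
mdnOnly[inputs[[3]]]                   (* association with "Alpha", "Mu", "Sigma" *)
trainedNet1[<|"Input" -> inputs[[3]], "Output" -> targets[[3]]|>]  (* the "Loss" value *)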