function [Bavg,outputs]=ssgd(dataTrain, dataTest, labelTrain, labelTest, lambda, params)
%SSGD Stochastic subgradient descent for learning a matrix B from labelled sequences.
%
% This function is loosely inspired by the solver of Lacoste-Julien, Jaggi
% et al. (2013). The training data are multivariate time series stored in a
% cell array (one matrix per series); the labels are stored in a cell array
% of the same length.
%
% Inputs:
%   dataTrain, dataTest   - cell arrays of (possibly sparse) matrices. The
%                           number of columns of dataTrain{1} fixes the size
%                           of B (presumably rows are time steps and columns
%                           are features -- confirm against the callers).
%   labelTrain, labelTest - cell arrays of label matrices. Each drawn
%                           training label Y is replaced by the T-by-T matrix
%                           Y*pinv(Y'*Y)*Y' before use, where T = size(Y,1).
%   lambda                - regularisation weight (also used in the stepsize).
%   params                - struct with the fields:
%       recordScheme : iterations at which errors/objective are recorded.
%       budget       : the loop runs while t < budget, i.e. budget-1 updates.
%                      NOTE(review): record points >= budget are never
%                      reached and their slots stay at 0 -- confirm this is
%                      the intended meaning of "budget".
%       projection   : if true, B is passed through projectionSDP after each
%                      step (set it to 1 if you want a true metric, B >= 0).
%       rescaled     : selects the loss-augmented inference matrix; the
%                      rescaled variant normalises the loss by 1/T (in our
%                      experiments it performs consistently better).
%       scale        : stepsize parameter; the stepsize at iteration t is
%                      scale/(scale + lambda*t).
%       avg          : 0 -> Bavg is the last iterate,
%                      1 -> uniform running average of the iterates,
%                      otherwise -> weighted average with weight 2/(t+1) on
%                      the newest iterate.
%
% Outputs:
%   Bavg    - final (averaged, depending on params.avg) iterate.
%   outputs - struct of 1-by-numel(recordScheme) vectors:
%       testError, trainError, objective          : measured on B,
%       testErrorAvg, trainErrorAvg, objectiveAvg : measured on Bavg.
%
% Note that scale matters: in practice it is better to try several values
% and to adjust it using Bottou's standard tricks. A stepsize of the form
% 1/(1 + scale*t)^0.75 is also known to work well.

recordScheme = params.recordScheme;

nTrain = length(dataTrain);
if nTrain == 0
    error('ssgd:emptyTrainingSet', 'dataTrain must contain at least one time series.');
end
nFeatures = size(dataTrain{1}, 2);

kMax = length(recordScheme);
k    = 1;
if kMax > 0
    recordIter = recordScheme(k);
else
    % No record point requested: NaN never compares equal to t.
    recordIter = NaN;
end

t              = 1;
B              = zeros(nFeatures);
Bavg           = B;
testError      = zeros(1, kMax);
trainError     = zeros(1, kMax);
objective      = zeros(1, kMax);
testErrorAvg   = zeros(1, kMax);
trainErrorAvg  = zeros(1, kMax);
objectiveAvg   = zeros(1, kMax);

while t<params.budget

    stepsize = params.scale/(params.scale + lambda * t);

    % Randomly draw a training instance (kept as floor(n*rand)+1 so that
    % seeded runs reproduce the same sequence of draws as before).
    i = floor(nTrain*(rand))+1;
    exampleData  = full(dataTrain{i});
    exampleLabel = full(labelTrain{i});

    % Turn the label into the T-by-T projector onto its column space.
    exampleLabel = exampleLabel*pinv(exampleLabel'*exampleLabel)*exampleLabel';
    T = size(exampleLabel, 1);

    % Matrix for loss-augmented inference
    if ~params.rescaled
        A = (exampleData*B*exampleData'-2*exampleLabel);
    else
        A = (exampleData*B*exampleData' - eye(T) + (1/T) * eye(T) - 2*exampleLabel/T);
    end

    Nopt = changePointDetection(A);

    % Subgradient of the regularised objective at B for this example
    sub = lambda * B + exampleData'*(Nopt-exampleLabel)*exampleData;

    % Step against the subgradient using the Pegasos stepsize
    B = B - stepsize * sub;

    if params.projection
        B = projectionSDP(B);
    end

    % Averaging scheme (see params.avg in the header)
    if params.avg == 0
        Bavg = B;
    elseif params.avg == 1
        Bavg = (1-1/t)*Bavg+B/t;
    else
        % weighted averaged version
        Bavg = (1-2/(t+1))*Bavg+2*B/(t+1);
    end

    if t == recordIter

        testErrorAvg(k)  = computeError(Bavg, labelTest, dataTest);
        trainErrorAvg(k) = computeError(Bavg,  labelTrain, dataTrain);
        objectiveAvg(k)  = computeObjective(Bavg, lambda, labelTrain, dataTrain, params);
        testError(k)     = computeError(B, labelTest, dataTest);
        trainError(k)    = computeError(B,  labelTrain, dataTrain);
        objective(k)     = computeObjective(B, lambda, labelTrain, dataTrain, params);
        fprintf('Iteration %d, test error %5.4e, train error %5.4e, objective %5.4e \n', t, testError(k), trainError(k), objective(k))

        % Move to the next record point; after the last one k stays at kMax
        % and recordIter stays at a value t has already passed.
        k          = min(k+1, kMax);
        recordIter = recordScheme(k);

    end

    t=t+1;

end

outputs.testError     = testError;
outputs.objective     = objective;
outputs.trainError    = trainError;
% Curves for the averaged iterate: previously computed but discarded.
outputs.testErrorAvg  = testErrorAvg;
outputs.objectiveAvg  = objectiveAvg;
outputs.trainErrorAvg = trainErrorAvg;

end