/**
 * ml - a neural network processor written with C
 * Copyright (C) 2023  jvech
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <stdlib.h>
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <openblas/cblas.h>

#include "util.h"
#include "nn.h"


struct Cost load_loss(struct Configs cfg);
static void dataset_shuffle_rows(
        double *inputs, size_t in_shape[2],
        double *labels, size_t lbl_shape[2]);

static void fill_random_weights(double *weights, double *bias, size_t rows, size_t cols);

static double get_avg_loss(
        double labels[], double outs[], size_t shape[2],
        double (*loss)(double *, double *, size_t));


double square_loss(double labels[], double net_outs[], size_t shape);
double square_dloss_out(double labels, double net_out);

struct Cost NN_SQUARE = {
    .func = square_loss,
    .dfunc_out = square_dloss_out
};

void nn_network_predict(
        double *output, size_t output_shape[2],
        double *input, size_t input_shape[2],
        Layer network[], size_t network_size)
{
    double **outs = calloc(network_size, sizeof(double *));
    double **zouts = calloc(network_size, sizeof(double *));
    size_t samples = input_shape[0];
    for (size_t l = 0; l < network_size; l++) {
        outs[l] = calloc(samples * network[l].neurons, sizeof(double));
        zouts[l] = calloc(samples * network[l].neurons, sizeof(double));
    }

    nn_forward(outs, zouts, input, input_shape, network, network_size);
    memmove(output, outs[network_size - 1], samples * output_shape[1] * sizeof(double));

    for (size_t l = 0; l < network_size; l++) {
        free(outs[l]);
        free(zouts[l]);
    }
    free(outs);
    free(zouts);
}

void nn_network_train(
        Layer network[], struct Configs ml_configs,
        double *input, size_t input_shape[2],
        double *labels, size_t labels_shape[2])
{
    assert(input_shape[0] == labels_shape[0] && "label samples don't correspond with input samples\n");

    size_t epochs = ml_configs.epochs;
    size_t batch_size = ml_configs.batch_size;
    size_t network_size = ml_configs.network_size;
    double alpha = ml_configs.alpha;
    bool shuffle = ml_configs.shuffle;
    struct Cost cost = load_loss(ml_configs);

    double **outs = calloc(network_size, sizeof(double *));
    double **zouts = calloc(network_size, sizeof(double *));
    double **weights = calloc(network_size, sizeof(double *));
    double **biases = calloc(network_size, sizeof(double *));

    if (!outs || !zouts || !weights || !biases) goto nn_network_train_error;

    double *input_random = calloc(input_shape[0] * input_shape[1], sizeof(double));
    double *labels_random = calloc(labels_shape[0] * labels_shape[1], sizeof(double));

    if (!input_random || !labels_random) goto nn_network_train_error;

    memcpy(input_random, input, sizeof(double) * input_shape[0] * input_shape[1]);
    memcpy(labels_random, labels, sizeof(double) * labels_shape[0] * labels_shape[1]);


    size_t samples = input_shape[0];
    for (size_t l = 0; l < network_size; l++) {
        outs[l] = calloc(batch_size * network[l].neurons, sizeof(double));
        zouts[l] = calloc(batch_size * network[l].neurons, sizeof(double));
        weights[l] = malloc(network[l].input_nodes * network[l].neurons * sizeof(double));
        biases[l] = malloc(network[l].neurons * sizeof(double));

        if (!outs[l] || !zouts || !weights[l] || !biases) goto nn_network_train_error;


        memcpy(weights[l], network[l].weights, sizeof(double) * network[l].input_nodes * network[l].neurons);
        memcpy(biases[l], network[l].bias, sizeof(double) * network[l].neurons);
    }


    size_t batch_input_shape[2] = {batch_size, input_shape[1]};
    size_t batch_labels_shape[2] = {batch_size, labels_shape[1]};
    size_t n_batches = input_shape[0] / batch_size;
    if (samples % batch_size) {
        n_batches++;
    }
    for (size_t epoch = 0; epoch < epochs; epoch++) {

        if (shuffle) {
            dataset_shuffle_rows(input_random, input_shape, labels_random, labels_shape);
        }


        for (size_t batch_idx = 0; batch_idx < n_batches; batch_idx++) {
            size_t index = batch_size * batch_idx;

            double *input_batch = input_random + index * input_shape[1];
            double *labels_batch = labels_random + index * labels_shape[1];

            if (batch_idx == n_batches - 1 && samples % batch_size) {
                batch_input_shape[0] = samples % batch_size;
                batch_labels_shape[0] = samples % batch_size;
            }

            nn_forward(outs, zouts, input_batch, batch_input_shape, network, network_size);
            nn_backward(
                    weights, biases,
                    zouts, outs,
                    input_batch, batch_input_shape,
                    labels_batch, batch_labels_shape,
                    network, network_size,
                    cost.dfunc_out, alpha);
            double *net_out = outs[network_size - 1];
            fprintf(stdout, "epoch: %g \t loss: %6.6lf\n",
                    epoch + (float)batch_idx / n_batches,
                    get_avg_loss(labels, net_out, batch_labels_shape, cost.func));
        }
    }

    for (size_t l = 0; l < network_size; l++) {
        free(outs[l]);
        free(zouts[l]);
        free(weights[l]);
        free(biases[l]);
    }

    free(zouts);
    free(outs);
    free(weights);
    free(biases);

    return;
nn_network_train_error:
    perror("nn_network_train() Error");
    exit(1);
}

void nn_backward(
        double **weights, double **bias,
        double **Zout, double **Outs,
        double *Input, size_t input_shape[2],
        double *Labels, size_t labels_shape[2],
        Layer network[], size_t network_size,
        double (dcost_out_func)(double, double),
        double alpha)
{
    size_t max_neurons = 0;
    for (size_t l = 0; l < network_size; l++) {
        max_neurons = (max_neurons > network[l].neurons) ? max_neurons : network[l].neurons;
    }
    double *dcost_outs = calloc(labels_shape[0] * labels_shape[1], sizeof(double));
    double *delta = calloc(max_neurons, sizeof(double));
    double *delta_next = calloc(max_neurons, sizeof(double));

    if (!dcost_outs || !delta || !delta_next) goto nn_backward_error;

    for (size_t i = 0; i < labels_shape[0]; i++) {
        for (size_t j = 0; j < labels_shape[1]; j++) {
            size_t index = i * labels_shape[1] + j;
            dcost_outs[index] = dcost_out_func(Labels[index], Outs[network_size - 1][index]);
        }
    }

    for (size_t sample = 0; sample < input_shape[0]; sample++) {
        for (size_t l = network_size - 1; l < network_size; l--) {
            size_t weights_shape[2] = {network[l].input_nodes, network[l].neurons};
            if (l == network_size - 1) {
                double *zout = Zout[l] + sample * network[l].neurons;
                double *out_prev = Outs[l - 1] + sample * network[l-1].neurons;
                double *dcost_out = dcost_outs + sample * network[l].neurons;
                nn_layer_out_delta(delta, dcost_out, zout, network[l].neurons, network[l].activation.dfunc);
                nn_layer_backward(weights[l], bias[l], weights_shape, delta, out_prev, alpha);
            } else if (l == 0) {
                size_t weights_next_shape[2] = {network[l+1].input_nodes, network[l+1].neurons};
                double *zout = Zout[l] + sample * network[l].neurons;
                double *input = Input + sample * input_shape[1];
                nn_layer_hidden_delta(delta, delta_next, zout, weights[l+1], weights_next_shape, network[l].activation.dfunc);
                nn_layer_backward(weights[l], bias[l], weights_shape, delta, input, alpha);
            } else {
                size_t weights_next_shape[2] = {network[l+1].input_nodes, network[l+1].neurons};
                double *zout = Zout[l] + sample * network[l].neurons;
                double *out_prev = Outs[l - 1] + sample * network[l-1].neurons;
                nn_layer_hidden_delta(delta, delta_next, zout, weights[l+1], weights_next_shape, network[l].activation.dfunc);
                nn_layer_backward(weights[l], bias[l], weights_shape, delta, out_prev, alpha);
            }
            memmove(delta_next, delta, weights_shape[1] * sizeof(double));
        }

    }

    for (size_t l = 0; l < network_size; l++) {
        size_t weights_shape[2] = {network[l].input_nodes, network[l].neurons};
        memcpy(network[l].weights, weights[l], weights_shape[0] * weights_shape[1] * sizeof(double));
        memcpy(network[l].bias, bias[l], weights_shape[1] * sizeof(double));
    }

    free(dcost_outs);
    free(delta);
    free(delta_next);
    return;
nn_backward_error:
    perror("nn_backward() Error");
    exit(1);
}

void nn_layer_backward(
        double *weights, double *bias, size_t weights_shape[2],
        double *delta, double *out_prev,
        double alpha)
{
    // W_next = W - alpha * out_prev @ delta.T
    cblas_dger(CblasRowMajor, weights_shape[0], weights_shape[1], -alpha,
               out_prev, 1, delta, 1, weights, weights_shape[1]);

    for (size_t j = 0; j < weights_shape[1]; j++)
        bias[j] = bias[j] - alpha * delta[j];
}

void nn_layer_hidden_delta(
        double *delta, double *delta_next, double *zout,
        double *weights_next, size_t weights_shape[2],
        double (*activation_derivative)(double))
{
    for (size_t j = 0; j < weights_shape[0]; j++) {
        double sum = 0;
        for (size_t k = 0; k < weights_shape[1]; k++) {
            size_t index = j * weights_shape[1] + k;
            sum += delta_next[k] * weights_next[index];
        }
        delta[j] = sum * activation_derivative(zout[j]);
    }
}

void nn_layer_out_delta(
        double *delta, double *error, double *zout,
        size_t cols,
        double (*activation_derivative)(double))
{

    for (size_t i = 0; i < cols; i++) {
        delta[i] = error[i] * activation_derivative(zout[i]);
    }
}

void nn_forward(
        double **out, double **zout,
        double *X, size_t X_shape[2],
        Layer network[], size_t network_size)
{
    size_t in_shape[2] = {X_shape[0], X_shape[1]};
    size_t out_shape[2];
    out_shape[0] = X_shape[0];
    double *input = X;

    for (size_t l = 0; l < network_size; l++) {
        out_shape[1] = network[l].neurons;
        nn_layer_forward(network[l], zout[l], out_shape, input, in_shape);
        nn_layer_map_activation(network[l].activation.func, out[l], out_shape, zout[l], out_shape);
        in_shape[1] = out_shape[1];
        input = out[l];
    }
}

void nn_layer_map_activation(
        double (*activation)(double),
        double *aout, size_t aout_shape[2],
        double *zout, size_t zout_shape[2])
{
    if (zout_shape[0] != aout_shape[0] || zout_shape[1] != aout_shape[1]) {
        fprintf(stderr,
                "nn_layer_map_activation() Error: zout must have (%zu x %zu) dimensions not (%zu x %zu)\n",
                aout_shape[0], aout_shape[1], zout_shape[0], zout_shape[1]);
        exit(1);
    }

    for (size_t i = 0; i < aout_shape[0]; i++) {
        for (size_t j = 0; j < aout_shape[1]; j ++) {
            size_t index = aout_shape[1] * i + j;
            aout[index] = activation(zout[index]);
        }
    }
}

void nn_layer_forward(Layer layer, double *zout, size_t zout_shape[2], double *input, size_t input_shape[2])
{
    if (zout_shape[0] != input_shape[0] || zout_shape[1] != layer.neurons) {
        fprintf(stderr,
                "nn_layer_forward() Error: zout must have (%zu x %zu) dimensions not (%zu x %zu)\n",
                input_shape[0], layer.neurons, zout_shape[0], zout_shape[1]);
        exit(1);
    }

    for (size_t i = 0; i < input_shape[0]; i++) {
        for (size_t j = 0; j < layer.neurons; j++) {
            size_t index = layer.neurons * i + j;
            zout[index] = layer.bias[j];
        }
    }

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                input_shape[0], layer.neurons, layer.input_nodes, // m, n, k
                1.0, input, input_shape[1], //alpha X
                layer.weights, layer.neurons, // W
                1.0, zout, layer.neurons); // beta B
}

void nn_network_read_weights(char *filepath, Layer *network, size_t network_size)
{
    FILE *fp = fopen(filepath, "rb");
    if (fp == NULL) die("nn_network_read_weights Error():");

    size_t net_size, shape[2], ret;
    ret = fread(&net_size, sizeof(size_t), 1, fp);
    if (net_size != network_size) goto nn_network_read_weights_error;

    for (size_t i = 0; i < network_size; i++) {
        fread(shape, sizeof(size_t), 2, fp);
        if (shape[0] != network[i].input_nodes
            || shape[1] != network[i].neurons) {
            goto nn_network_read_weights_error;
        }

        if (!network[i].weights || !network[i].bias) {
            die("nn_network_read_weights() Error: "
                "the weights on layer %zu haven't been initialized", i);
        }

        ret = fread(network[i].weights, sizeof(double), shape[0] * shape[1], fp);
        if (ret != shape[0] * shape[1]) goto nn_network_read_weights_error;

        ret = fread(network[i].bias, sizeof(double), shape[1], fp);
        if (ret != shape[1]) goto nn_network_read_weights_error;
    }

    fclose(fp);
    return;

nn_network_read_weights_error:
    fclose(fp);
    die("nn_network_read_weights() Error: "
        "number of read objects does not match with expected ones");
}

void nn_network_write_weights(char *filepath, Layer *network, size_t network_size)
{
    FILE *fp = fopen(filepath, "wb");
    if (fp == NULL) die("nn_network_write_weights() Error:");

    fwrite(&network_size, sizeof(size_t), 1, fp);

    size_t ret;
    for (size_t i = 0; i < network_size; i++) {
        size_t shape[2] = {network[i].input_nodes, network[i].neurons};
        size_t size = shape[0] * shape[1];

        ret = fwrite(shape, sizeof(size_t), 2, fp);
        if (ret != 2) goto nn_network_write_weights_error;

        ret = fwrite(network[i].weights, sizeof(double), size, fp);
        if (ret != size) goto nn_network_write_weights_error;

        ret = fwrite(network[i].bias, sizeof(double), network[i].neurons, fp);
        if (ret != network[i].neurons) goto nn_network_write_weights_error;
    }
    fclose(fp);
    return;

nn_network_write_weights_error:
    fclose(fp);
    die("nn_network_write_weights() Error: "
        "number of written objects does not match with number of objects");
}

void nn_network_init_weights(Layer layers[], size_t nmemb, size_t n_inputs, bool fill_random)
{
    size_t i, prev_size = n_inputs;


    for (i = 0; i < nmemb;  i++) {
        layers[i].weights = calloc(prev_size * layers[i].neurons, sizeof(double));
        layers[i].bias = calloc(layers[i].neurons, sizeof(double));

        if (layers[i].weights == NULL || layers[i].bias == NULL) {
            goto nn_layers_calloc_weights_error;
        }

        if (fill_random) fill_random_weights(layers[i].weights, layers[i].bias, prev_size, layers[i].neurons);

        layers[i].input_nodes = prev_size;
        prev_size = layers[i].neurons;
    }

    return;

nn_layers_calloc_weights_error:
    perror("nn_layers_calloc_weights() Error");
    exit(1);
}

void nn_network_free_weights(Layer layers[], size_t nmemb)
{
    size_t i;
    for (i = 0; i < nmemb; i++) {
        free(layers[i].weights);
        free(layers[i].bias);
    }
}

void fill_random_weights(double *weights, double *bias, size_t rows, size_t cols)
{
    FILE *fp = fopen("/dev/random", "rb");
    if (fp == NULL) goto nn_fill_random_weights_error;

    size_t weights_size = rows * cols;
    int64_t *random_weights = calloc(weights_size, sizeof(int64_t));
    int64_t *random_bias = calloc(cols, sizeof(int64_t));

    fread(random_weights, sizeof(int64_t), weights_size, fp);
    fread(random_bias, sizeof(int64_t), cols, fp);

    if (!random_weights || !random_bias) goto nn_fill_random_weights_error;

    for (size_t i = 0; i < weights_size; i++) {
        weights[i] = (double)random_weights[i] / (double)INT64_MAX * 2;
    }

    for (size_t i = 0; i < cols; i++) {
        bias[i] = (double)random_bias[i] / (double)INT64_MAX * 2;
    }

    free(random_weights);
    free(random_bias);
    fclose(fp);
    return;

nn_fill_random_weights_error:
    perror("nn_fill_random_weights Error()");
    exit(1);
}

void dataset_shuffle_rows(
        double *inputs, size_t in_shape[2],
        double *labels, size_t lbl_shape[2])
{
    size_t random_row;
    size_t in_index, lbl_index;
    size_t shuffle_in_index, column_in_bytes;
    size_t shuffle_lbl_index, column_lbl_bytes;
    double *in_buffer, *lbl_buffer;

    in_buffer = malloc(sizeof(double) * in_shape[1]);
    lbl_buffer = malloc(sizeof(double) * lbl_shape[1]);

    if (in_buffer == NULL || lbl_buffer == NULL)
        goto dataset_shuffle_rows_error;

    column_in_bytes = sizeof(double) * in_shape[1];
    column_lbl_bytes = sizeof(double) * lbl_shape[1];
    for (size_t row = 0; row < in_shape[0]; row++) {
        /* Swap actual row with a random row*/
        random_row = random() % in_shape[0];

        /* Input Swap */
        in_index = row * in_shape[1];
        shuffle_in_index = random_row * in_shape[1];
        memcpy(in_buffer, inputs + in_index, column_in_bytes);
        memcpy(inputs + in_index, inputs + shuffle_in_index, column_in_bytes);
        memcpy(inputs + shuffle_in_index, in_buffer, column_in_bytes);

        /* Label Swap */
        lbl_index = row * lbl_shape[1];
        shuffle_lbl_index = random_row * lbl_shape[1];
        memcpy(lbl_buffer, labels + lbl_index, column_lbl_bytes);
        memcpy(labels + lbl_index, labels + shuffle_lbl_index, column_lbl_bytes);
        memcpy(labels + shuffle_lbl_index, lbl_buffer, column_lbl_bytes);

    }

    free(in_buffer);
    free(lbl_buffer);
    return;

dataset_shuffle_rows_error:
    die("dataset_shuffle_rows() malloc Error:");
}

double square_loss(double labels[], double net_out[], size_t shape)
{
    double sum = 0;
    for (size_t i = 0; i < shape; i++) {
        sum += pow(labels[i] - net_out[i], 2);
    }
    return 0.5 * sum;
}

double square_dloss_out(double label, double net_out)
{
    return net_out - label;
}

double get_avg_loss(
        double labels[], double outs[], size_t shape[2],
        double (*loss)(double *, double *, size_t shape))
{
    double sum = 0;
    for (size_t i = 0; i < shape[0]; i += shape[1]) {
        sum += loss(labels + i, outs + i, shape[1]);
    }
    return sum / shape[0];
}

struct Cost load_loss(struct Configs cfg)
{
    if (!strcmp("square", cfg.loss)) return NN_SQUARE;
    die("load_loss() Error: Unknown '%s' loss function", cfg.loss);
    exit(1);
}

#ifdef NN_TEST
/*
 * compile: clang -Wall -Wextra -g -DNN_TEST -o objs/test_nn src/util.c src/nn.c $(pkg-config --libs-only-l blas) -lm
 */
int main(void) {
    /*
     * array_shuffle_rows() test
     */
    srandom(42);
    double input_array[12] = {
        11, 12, 13,
        21, 22, 23,
        31, 32, 33,
        41, 42, 43,
    };
    double shuffled_input_array[12] = {
        21, 22, 23,
        41, 42, 43,
        31, 32, 33,
        11, 12, 13,
    };
    size_t in_shape[2] = {4,3};

    double label_array[4] = {1, 2, 3, 4};
    double shuffled_label_array[4] = {2, 4, 3, 1};
    size_t lbl_shape[2] = {4,1};

    dataset_shuffle_rows(input_array, in_shape, label_array, lbl_shape);
    size_t i, j, index;
    for (i = 0; i < in_shape[0]; i++) {
        for (j = 0; j < in_shape[1]; j++) {
            index = i * in_shape[1] + j;
            if (input_array[index] != shuffled_input_array[index]) {
                printf("- array_shuffle_rows() failure: input_array mismatch on (%zu,%zu)\n", i, j);
                return 1;
            }
        }

        for (j = 0; j < lbl_shape[1]; j++) {
            index = i * lbl_shape[1] + j;
            if (label_array[index] != shuffled_label_array[index]) {
                printf("- array_shuffle_rows() failure: label_array mismatch on (%zu,%zu)\n", i, j);
                return 1;
            }
        }
    }
    printf("- array_shuffle_rows() success\n");

    return 0;
}
#endif //NN_TEST