From ce0001538820d819bf965a24ffbb6f6e6269859c Mon Sep 17 00:00:00 2001
From: jvech <jmvalenciae@unal.edu.co>
Date: Fri, 26 Jul 2024 09:47:52 -0500
Subject: add: file_write() implemented

things implemented:
    - CLI option --only-out (-O) added: predict can omit the input fields
    - CLI option --format is only needed for stdin/stdout; named files use their extension
    - csv, tsv and json output support
---
 src/main.c  |  61 ++++++-----------
 src/parse.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 src/parse.h |  10 +++
 src/util.c  |  14 +++-
 src/util.h  |   2 +
 5 files changed, 227 insertions(+), 78 deletions(-)

(limited to 'src')

diff --git a/src/main.c b/src/main.c
index e692756..216d8d4 100644
--- a/src/main.c
+++ b/src/main.c
@@ -34,45 +34,6 @@
 #define ARRAY_SIZE(x, type) sizeof(x) / sizeof(type)
 
 
-static void json_write(
-        const char *filepath,
-        Array input, Array out,
-        char *out_keys[], size_t out_keys_size,
-        char *in_keys[], size_t in_keys_size);
-
-void json_write(
-        const char *filepath,
-        Array input, Array out,
-        char *out_keys[], size_t out_keys_size,
-        char *in_keys[], size_t in_keys_size)
-{
-    FILE *fp = (!filepath) ? fopen("/dev/stdout", "w") : fopen(filepath, "w");
-    if (!fp) die("json_read() Error:");
-    fprintf(fp, "[\n");
-
-    for (size_t i = 0; i < input.shape[0]; i++) {
-        fprintf(fp, "  {\n");
-
-        for (size_t j = 0; j < input.shape[1]; j++) {
-            size_t index = input.shape[1] * i + j;
-            fprintf(fp, "    \"%s\": %lf,\n", in_keys[j], input.data[index]);
-        }
-
-        for (size_t j = 0; j < out.shape[1]; j++) {
-            size_t index = out.shape[1] * i + j;
-            fprintf(fp, "    \"%s\": %lf", out_keys[j], out.data[index]);
-
-            if (j == out.shape[1] - 1) fprintf(fp, "\n");
-            else fprintf(fp, ",\n");
-        }
-
-        if (i == input.shape[0] - 1) fprintf(fp, "  }\n");
-        else fprintf(fp, "  },\n");
-    }
-    fprintf(fp, "]\n");
-    fclose(fp);
-}
-
 void load_config(struct Configs *cfg, int n_args, ...)
 {
     char *filepath;
@@ -133,6 +94,7 @@ int main(int argc, char *argv[]) {
         .alpha = 1e-5,
         .config_filepath = "utils/settings.cfg",
         .network_size = 0,
+        .only_out = false,
         .file_format = NULL,
         .out_filepath = NULL,
     };
@@ -153,7 +115,10 @@ int main(int argc, char *argv[]) {
 
     Array X, y;
     if (!strcmp("train", argv[0])) {
-        file_read(argv[1], &X, &y, ml_configs.input_keys, ml_configs.n_input_keys, ml_configs.label_keys, ml_configs.n_label_keys, true, ml_configs.file_format);
+        file_read(argv[1], &X, &y,
+                  ml_configs.input_keys, ml_configs.n_input_keys,
+                  ml_configs.label_keys, ml_configs.n_label_keys,
+                  true, ml_configs.file_format);
         nn_network_init_weights(network, ml_configs.network_size, X.shape[1], true);
         nn_network_train(
                 network, ml_configs.network_size,
@@ -165,11 +130,23 @@ int main(int argc, char *argv[]) {
         nn_network_write_weights(ml_configs.weights_filepath, network, ml_configs.network_size);
         fprintf(stderr, "weights saved on '%s'\n", ml_configs.weights_filepath);
     } else if (!strcmp("predict", argv[0])) {
-        file_read(argv[1], &X, &y, ml_configs.input_keys, ml_configs.n_input_keys, ml_configs.label_keys, ml_configs.n_label_keys, false, ml_configs.file_format);
+        file_read(argv[1], &X, &y,
+                  ml_configs.input_keys, ml_configs.n_input_keys,
+                  ml_configs.label_keys, ml_configs.n_label_keys,
+                  false, ml_configs.file_format);
         nn_network_init_weights(network, ml_configs.network_size, X.shape[1], false);
         nn_network_read_weights(ml_configs.weights_filepath, network, ml_configs.network_size);
         nn_network_predict(y.data, y.shape, X.data, X.shape, network, ml_configs.network_size);
-        json_write(ml_configs.out_filepath, X, y, ml_configs.label_keys, ml_configs.n_label_keys, ml_configs.input_keys, ml_configs.n_input_keys);
+
+        // If neither the output file nor file_format is defined, infer the format from the input file
+        if (!ml_configs.file_format && !ml_configs.out_filepath) {
+            ml_configs.file_format = file_format_infer(ml_configs.in_filepath);
+        }
+
+        file_write(ml_configs.out_filepath, X, y,
+                   ml_configs.input_keys, ml_configs.n_input_keys,
+                   ml_configs.label_keys, ml_configs.n_label_keys,
+                   !ml_configs.only_out, ml_configs.file_format);
     } else usage(1);
 
     nn_network_free_weights(network, ml_configs.network_size);
diff --git a/src/parse.c b/src/parse.c
index 18668ec..c9b17ca 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -1,3 +1,21 @@
+/**
+ * ml - a neural network processor written with C
+ * Copyright (C) 2023  jvech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
 #include <stdio.h>
 #include <stdbool.h>
 #include <string.h>
@@ -26,6 +44,21 @@ static void csv_read(
         char separator
         );
 
+static void json_write(
+        FILE *fp,
+        Array input, Array out,
+        char *in_keys[], size_t in_keys_size,
+        char *out_keys[], size_t out_keys_size,
+        bool write_input
+        );
+
+static void csv_write(
+        FILE *fp,
+        Array input, Array out,
+        bool write_input,
+        char separator
+        );
+
 static void csv_columns_select(
         double *dst_row, double *src_row,
         size_t selected_cols[], size_t cols_size,
@@ -38,6 +71,7 @@ static void csv_readline_values(
 
 static void csv_keys2cols(size_t cols[], char *keys[], size_t keys_size);
 
+
 void file_read(
         char *filepath,
         Array *input, Array *out,
@@ -47,22 +81,19 @@ void file_read(
         char *file_format)
 {
     FILE *fp;
-    char *ptr;
-    int i, string_length;
 
-    fp = (!strcmp(filepath, "-")) ? fopen("/dev/stdin", "r") : fopen(filepath, "r");
-    if (fp == NULL) die("file_read() Error:");
-
-    if (file_format == NULL && !strcmp(filepath, "-")) {
-        die("file_read() Error: on standard input the format must be defined");
+    if (filepath != NULL && strcmp(filepath, "-")) {
+        fp = fopen(filepath, "r");
+        file_format = file_format_infer(filepath);
+    } else {
+        fp = fopen("/dev/stdin", "r");
+        if (file_format == NULL) {
+            die("file_read() Error: file format must be defined");
+        }
     }
 
     if (file_format == NULL) {
-        string_length = strlen(filepath);
-        ptr = filepath + string_length;
-        for (i = string_length; i > 0 && *ptr != '.'; ptr--, i--);
-        if (*ptr != '.' || i == 0) die("file_read() Error: unable to infer %s format", filepath);
-        file_format = ptr + 1;
+        file_format = file_format_infer(filepath);
     }
 
     if (!strcmp(file_format, "csv"))        csv_read(fp, input, out, in_keys, n_in_keys, out_keys, n_out_keys, read_output, false, ',');
@@ -75,6 +106,38 @@ void file_read(
     fclose(fp);
 }
 
+void file_write(
+        char *filepath,
+        Array input, Array out,
+        char *in_keys[], size_t n_in_keys,
+        char *out_keys[], size_t n_out_keys,
+        bool write_input,
+        char *file_format)
+{
+    /* Write `input` (when write_input is set) and `out` as json, csv or tsv.
+     * A named file takes its format from its extension; a NULL or "-"
+     * filepath means standard output and requires `file_format`. */
+    bool to_file = filepath != NULL && strcmp(filepath, "-") != 0;
+
+    if (to_file) file_format = file_format_infer(filepath);
+    else if (file_format == NULL) die("file_write() Error: file format must be defined");
+
+    // Reject unknown formats before fopen() so a bad call never truncates the target
+    if (strcmp(file_format, "json") && strcmp(file_format, "csv") && strcmp(file_format, "tsv")) {
+        die("file_write() Error: unable to write %s files", file_format);
+    }
+    // Reuse the stdout stream: reopening "/dev/stdout" with "w" can truncate a redirect target
+    FILE *fp = to_file ? fopen(filepath, "w") : stdout;
+    if (fp == NULL) die("file_write() Error:");
+
+    if (!strcmp(file_format, "json"))       json_write(fp, input, out, in_keys, n_in_keys, out_keys, n_out_keys, write_input);
+    else if (!strcmp(file_format, "csv"))   csv_write(fp, input, out, write_input, ',');
+    else                                    csv_write(fp, input, out, write_input, '\t');
+
+    // Flush (and close a named file) so buffered write failures are not silently lost
+    if ((to_file ? fclose(fp) : fflush(fp)) != 0) die("file_write() Error:");
+}
+
 void json_read(
         FILE *fp,
         Array *input, Array *out,
@@ -83,11 +146,12 @@ void json_read(
         bool read_output)
 {
     static char fp_buffer[MAX_FILE_SIZE];
+    size_t i, j, json_obj_length, index;
 
 
     if (fp == NULL) goto json_read_error;
 
-    size_t i = 0;
+    i = 0;
     do {
         if (i >= MAX_FILE_SIZE) die("json_read() Error: file size is bigger than '%zu'", i, MAX_FILE_SIZE);
         fp_buffer[i] = fgetc(fp);
@@ -95,7 +159,7 @@ void json_read(
 
     json_object *json_obj;
     json_obj = json_tokener_parse(fp_buffer);
-    size_t json_obj_length = json_object_array_length(json_obj);
+    json_obj_length = json_object_array_length(json_obj);
 
     input->shape[0] = (size_t)json_obj_length;
     input->shape[1] = n_input_keys;
@@ -107,18 +171,18 @@ void json_read(
 
     if (!input->data || !out->data) goto json_read_error;
 
-    for (int i = 0; i < json_object_array_length(json_obj); i++) {
+    for (i = 0; i < json_object_array_length(json_obj); i++) {
         json_object *item = json_object_array_get_idx(json_obj, i);
 
-        for (int j = 0; j < n_input_keys; j++) {
-            size_t index = n_input_keys * i + j;
+        for (j = 0; j < n_input_keys; j++) {
+            index = n_input_keys * i + j;
             input->data[index] = json_object_get_double(json_object_object_get(item, in_keys[j]));
         }
 
         if (!read_output) continue;
 
-        for (int j = 0; j < n_out_keys; j++) {
-            size_t index =  n_out_keys * i + j;
+        for (j = 0; j < n_out_keys; j++) {
+            index =  n_out_keys * i + j;
             out->data[index] = json_object_get_double(json_object_object_get(item, out_keys[j]));
         }
     }
@@ -197,6 +261,74 @@ void csv_read(
     return;
 }
 
+void json_write(
+        FILE *fp,
+        Array input, Array out,
+        char *in_keys[], size_t in_keys_size,
+        char *out_keys[], size_t out_keys_size,
+        bool write_input)
+{
+    /* Write a JSON array holding one object per row of `input`: the input
+     * columns (only when write_input is set) followed by the out columns. */
+    const char *sep;
+
+    // Validate first so a key/column mismatch never leaves a half-written document
+    if (write_input && in_keys_size != input.shape[1]) {
+        die("json_write() Error: number of keys (%zu) differs from input columns (%zu)",
+            in_keys_size, input.shape[1]);
+    }
+
+    if (out_keys_size != out.shape[1]) {
+        die("json_write() Error: number of keys (%zu) differs from output columns (%zu)",
+            out_keys_size, out.shape[1]);
+    }
+
+    fprintf(fp, "[\n");
+    for (size_t i = 0; i < input.shape[0]; i++) {
+        fprintf(fp, "  {\n");
+        sep = "";   // printed before every member except the first: no trailing comma
+
+        for (size_t j = 0; write_input && j < input.shape[1]; j++) {
+            fprintf(fp, "%s    \"%s\": %lf", sep, in_keys[j], input.data[input.shape[1] * i + j]);
+            sep = ",\n";
+        }
+
+        for (size_t j = 0; j < out.shape[1]; j++) {
+            fprintf(fp, "%s    \"%s\": %lf", sep, out_keys[j], out.data[out.shape[1] * i + j]);
+            sep = ",\n";
+        }
+
+        // end the last member's line (if any member was written), then close the object
+        fprintf(fp, "%s  }%s\n", (*sep) ? "\n" : "", (i + 1 < input.shape[0]) ? "," : "");
+    }
+    fprintf(fp, "]\n");
+}
+
+void csv_write(
+        FILE *fp,
+        Array input, Array out,
+        bool write_input,
+        char separator
+        )
+{
+    /* One text line per row of `input`: its values first (only when
+     * write_input is set), then the values of `out`, split by `separator`. */
+    const size_t n_in = write_input ? input.shape[1] : 0;
+    const size_t n_out = out.shape[1];
+
+    for (size_t row = 0; row < input.shape[0]; row++) {
+        // each input value carries its own trailing separator
+        for (size_t k = 0; k < n_in; k++) {
+            fprintf(fp, "%lf%c", input.data[n_in * row + k], separator);
+        }
+        for (size_t k = 0; k < n_out; k++) {
+            if (k > 0) fputc(separator, fp);    // only between out values
+            fprintf(fp, "%lf", out.data[n_out * row + k]);
+        }
+        fputc('\n', fp);
+    }
+}
+
 void csv_columns_select(
         double *dst_row, double *src_row,
         size_t selected_cols[], size_t cols_size,
@@ -265,11 +397,27 @@ void csv_keys2cols(size_t cols[], char *keys[], size_t keys_size)
     }
 }
 
+char * file_format_infer(char *filename)
+{
+    /* Return a pointer (into `filename`) to the text after the last '.' of
+     * the final path component; die() when that component has no extension. */
+    char *base, *dot;
+
+    base = strrchr(filename, '/');   // a '.' inside a directory name is not an extension
+    base = (base == NULL) ? filename : base + 1;
+    dot = strrchr(base, '.');
+    if (dot == NULL || dot == base) {
+        die("file_format_infer() Error: unable to infer %s format", filename);
+    }
+    return dot + 1;
+}
+
+
 #ifdef PARSE_TEST
 #include <assert.h>
 #include <string.h>
 /*
- * compile: clang -Wall -g -DPARSE_TEST -o objs/test_parse src/util.c src/parse.c $(pkg-config --libs-only-l json-c)
+ * compile: clang -Wall -Wextra -g -DPARSE_TEST -o objs/test_parse src/util.c src/parse.c $(pkg-config --libs-only-l json-c)
  */
 size_t parse_keys(char *keys[], char *argv, char key_buffer[512])
 {
@@ -291,47 +439,51 @@ size_t parse_keys(char *keys[], char *argv, char key_buffer[512])
 }
 
 int main(int argc, char *argv[]) {
-    char *filename, *format;
+    char *in_file, *out_file, *format;
     size_t i, j;
 
-    if (argc < 4 || argc > 5) {
+    if (argc != 5 && argc != 6) {
         fprintf(stderr,
-                "Usage: parse_test FILENAME IN_KEYS OUT_KEYS [FORMAT]\n"
+                "Usage: parse_test IN_FILE OUT_FILE IN_KEYS OUT_KEYS [FORMAT]\n"
                 "\nKeys format:\n"
                 "  IN_KEYS: in_key1, in_key2, ...\n"
                 "  OUT_KEYS: out_key1, out_key2, ...\n\n");
         return 1;
     }
 
-    filename = argv[1];
+    in_file = argv[1];
+    out_file = argv[2];
     format = NULL;
-    if (argc == 5) {
-        format = argv[4];
+    if (argc == 6) {
+        format = argv[5];
     }
 
     Array X, y;
     char *in_cols[32], *out_cols[32], keys_buffer[512];
     size_t n_in_cols, n_out_cols;
 
-    n_in_cols = parse_keys(in_cols, argv[2], keys_buffer);
-    n_out_cols = parse_keys(out_cols, argv[3], keys_buffer);
+    n_in_cols = parse_keys(in_cols, argv[3], keys_buffer);
+    n_out_cols = parse_keys(out_cols, argv[4], keys_buffer);
 
-    file_read(filename, &X, &y, in_cols, 2, out_cols, 1, true, format);
+    file_read(in_file, &X, &y, in_cols, n_in_cols, out_cols, n_out_cols, true, format);
 
     for (i = 0; i < X.shape[0]; i++) {
         for (j = 0; j < X.shape[1]; j++) {
-            printf("%*.2e\t", 4, X.data[i * X.shape[1] + j]);
+            fprintf(stderr, "%.2e\t", X.data[i * X.shape[1] + j]);
         }
 
         for (j = 0; j < y.shape[1]; j++) {
-            if (j == 0) printf("|\t");
-            printf("%5.2e", y.data[i * y.shape[1] + j]);;
+            if (j == 0) fprintf(stderr, "|\t");
+            fprintf(stderr, "%.2e", y.data[i * y.shape[1] + j]);;
             if (j < y.shape[1] - 1) printf("\t");
         }
-        printf("\n");
+        fprintf(stderr, "\n");
 
     }
 
+    // use input format if format variable is not defined
+    format = (!format && !strcmp(out_file, "-")) ? file_format_infer(in_file) : format;
+    file_write(out_file, X, y, in_cols, n_in_cols, out_cols, n_out_cols, true, format);
     for (i = 0; i < n_in_cols; i++) free(in_cols[i]);
     for (i = 0; i < n_out_cols; i++) free(out_cols[i]);
 
diff --git a/src/parse.h b/src/parse.h
index d5f99d0..d8aeada 100644
--- a/src/parse.h
+++ b/src/parse.h
@@ -18,4 +18,14 @@ void file_read(
         bool read_output,
         char *file_format
         );
+
+void file_write(
+        char *filepath,
+        Array input, Array out,
+        char *in_keys[], size_t n_in_keys,
+        char *out_keys[], size_t n_out_keys,
+        bool write_input,
+        char *file_format);
+
+char * file_format_infer(char *filename);
 #endif
diff --git a/src/util.c b/src/util.c
index 8a7924f..8fa8a87 100644
--- a/src/util.c
+++ b/src/util.c
@@ -92,14 +92,15 @@ void usage(int exit_code)
     FILE *fp = (!exit_code) ? stdout : stderr;
     fprintf(fp,
             "Usage: ml train [Options] FILE\n"
-            "   or: ml predict [-o FILE] FILE\n"
+            "   or: ml predict [-Ohv] [-f FORMAT] [-o FILE] FILE\n"
             "\n"
             "Options:\n"
             "  -h, --help               Show this message\n"
-            "  -f, --format=FORMAT      File input and/or output format\n"
+            "  -f, --format=FORMAT      Define input or output FILE format if needed\n"
             "  -a, --alpha=ALPHA        Learning rate (only works with train)\n"
             "  -e, --epochs=EPOCHS      Epochs to train the model (only works with train)\n"
             "  -o, --output=FILE        Output file (only works with predict)\n"
+            "  -O, --only-out           Don't show input fields (only works with predict)\n"
             "  -c, --config=FILE        Configuration filepath [default=~/.config/ml/ml.cfg]\n"
             "\n"
            );
@@ -117,12 +118,13 @@ void util_load_cli(struct Configs *ml, int argc, char *argv[])
         {"alpha",       required_argument,  0, 'a'},
         {"output",      required_argument,  0, 'o'},
         {"config",      required_argument,  0, 'c'},
+        {"only-out",    no_argument,        0, 'O'},
         {0,             0,                  0,  0 },
     };
     int c;
 
     while (1) {
-        c = getopt_long(argc, argv, "hvc:e:a:o:i:f:", long_opts, NULL);
+        c = getopt_long(argc, argv, "hvOc:e:a:o:i:f:", long_opts, NULL);
 
         if (c == -1) {
             break;
@@ -143,12 +145,18 @@ void util_load_cli(struct Configs *ml, int argc, char *argv[])
         case 'f':
             ml->file_format = optarg;
             break;
+        case 'O':
+            ml->only_out = true;
+            break;
         case 'h':
             usage(0);
+            break;
         case 'v':
             version();
+            break;
         default:
             usage(1);
+            break;
         }
     }
 
diff --git a/src/util.h b/src/util.h
index 9523ab7..dbaae15 100644
--- a/src/util.h
+++ b/src/util.h
@@ -1,6 +1,7 @@
 #ifndef UTIL_
 #define UTIL_
 
+#include <stdbool.h>
 #include <stddef.h>
 
 struct Configs {
@@ -16,6 +17,7 @@ struct Configs {
     char *file_format;
     char *in_filepath;
     char *out_filepath;
+    bool only_out;
     /* layer cfgs */
     size_t network_size;
     size_t *neurons;
-- 
cgit v1.2.3-70-g09d2