add: file_read() and format integrated into the main program

Things implemented: - fixed the bug that occurred when read_output is false. - added a generic Make rule to build test executables. - added the format option to the CLI.
author: jvech <jmvalenciae@unal.edu.co> 2024-07-24 15:31:02 -0500
committer: jvech <jmvalenciae@unal.edu.co> 2024-07-24 15:31:02 -0500
commit: d45581c0b067b9526ce88ba9d3a1bd861f4ff7cc (patch)
tree: a907346b2b282437537d7f4f6b138b3efddcce22
parent: b9deaf6ec1ba587f2b81a63c75b696c6def33436 (diff)
6 files changed, 31 insertions, 91 deletions
diff --git a/Makefile b/Makefile
index 4521115..7ca4091 100644
--- a/Makefile
+++ b/Makefile
@@ -48,9 +48,12 @@ run: build
 	@jq -r '.[] | [values[] as $$val | $$val] | @tsv' data/sample_data.json > data/sample_data.tsv
 	@gnuplot utils/plot.gpi
 
+test_%: src/%.c $(OBJDIR)
+	$(shell sed -n 's/.*compile: clang/clang/;/clang/p' $<)
+
 debug: build
 	gdb -x utils/commands.gdb --tui --args ${BIN} train data/xor.json -e 100
 	@#gdb -x utils/commands.gdb --tui --args ${BIN} predict data/sample_data.json
 
 clean:
-	@rm $(OBJS) $(OBJDIR) -rv
+	@rm $(OBJDIR) -rv
diff --git a/doc/ml.1 b/doc/ml.1
index babac62..6450709 100644
--- a/doc/ml.1
+++ b/doc/ml.1
@@ -1,10 +1,10 @@
 .\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.49.3.
-.TH ML "1" "October 2023" "ml 0.1" "User Commands"
+.TH ML "1" "July 2024" "ml 0.1" "User Commands"
 .SH NAME
 ml \- manual page for ml 0.1
 .SH SYNOPSIS
 .B ml
-\fI\,train \/\fR[\fI\,Options\/\fR] \fI\,JSON_FILE\/\fR
+\fI\,train \/\fR[\fI\,Options\/\fR] \fI\,FILE\/\fR
 .br
 .B ml
 \fI\,predict \/\fR[\fI\,-o FILE\/\fR] \fI\,FILE\/\fR
@@ -16,6 +16,9 @@ it is suitable to work on classification problems.
 \fB\-h\fR, \fB\-\-help\fR
 Show this message
 .TP
+\fB\-f\fR, \fB\-\-format\fR=\fI\,FORMAT\/\fR
+File input and/or output format
+.TP
 \fB\-a\fR, \fB\-\-alpha\fR=\fI\,ALPHA\/\fR
 Learning rate (only works with train)
 .TP
diff --git a/src/main.c b/src/main.c
index dab8bd9..e692756 100644
--- a/src/main.c
+++ b/src/main.c
@@ -26,91 +26,20 @@
 #include <json-c/json.h>
 
 #include "util.h"
+#include "parse.h"
 #include "nn.h"
 
 #define MAX_FILE_SIZE 536870912 //1<<29; 0.5 GiB
 
-typedef struct Array {
-    double *data;
-    size_t shape[2];
-} Array;
-
 #define ARRAY_SIZE(x, type) sizeof(x) / sizeof(type)
 
 
-static void json_read(
-        const char *filepath,
-        Array *input, Array *out,
-        char *out_keys[], size_t out_keys_size,
-        char *in_keys[], size_t in_keys_size,
-        bool read_output);
-
 static void json_write(
         const char *filepath,
         Array input, Array out,
         char *out_keys[], size_t out_keys_size,
         char *in_keys[], size_t in_keys_size);
 
-void json_read(
-        const char *filepath,
-        Array *input, Array *out,
-        char *out_keys[], size_t n_out_keys,
-        char *in_keys[], size_t n_input_keys,
-        bool read_output)
-{
-    FILE *fp = NULL;
-    static char fp_buffer[MAX_FILE_SIZE];
-
-    fp = (!strcmp(filepath, "-")) ? fopen("/dev/stdin", "r") : fopen(filepath, "r");
-
-    if (fp == NULL) goto json_read_error;
-
-    size_t i = 0;
-    do {
-        if (i >= MAX_FILE_SIZE) die("json_read() Error: file size is bigger than '%zu'", i, MAX_FILE_SIZE);
-        fp_buffer[i] = fgetc(fp);
-    } while (fp_buffer[i++] != EOF);
-
-    json_object *json_obj;
-    json_obj = json_tokener_parse(fp_buffer);
-    size_t json_obj_length = json_object_array_length(json_obj);
-
-    input->shape[0] = (size_t)json_obj_length;
-    input->shape[1] = n_input_keys;
-    input->data = calloc(input->shape[0] * input->shape[1], sizeof(input->data[0]));
-
-    out->shape[0] = (size_t)json_obj_length;
-    out->shape[1] = n_out_keys;
-    out->data = calloc(out->shape[0] * out->shape[1], sizeof(out->data[0]));
-
-    if (!input->data || !out->data) goto json_read_error;
-
-    for (int i = 0; i < json_object_array_length(json_obj); i++) {
-        json_object *item = json_object_array_get_idx(json_obj, i);
-
-        for (int j = 0; j < n_input_keys; j++) {
-            size_t index = n_input_keys * i + j;
-            input->data[index] = json_object_get_double(json_object_object_get(item, in_keys[j]));
-        }
-
-        if (!read_output) continue;
-
-        for (int j = 0; j < n_out_keys; j++) {
-            size_t index =  n_out_keys * i + j;
-            out->data[index] = json_object_get_double(json_object_object_get(item, out_keys[j]));
-        }
-    }
-
-    json_object_put(json_obj);
-    fclose(fp);
-
-    return;
-
-json_read_error:
-    perror("json_read() Error");
-    exit(1);
-}
-
 void json_write(
         const char *filepath,
         Array input, Array out,
@@ -204,6 +133,7 @@ int main(int argc, char *argv[]) {
         .alpha = 1e-5,
         .config_filepath = "utils/settings.cfg",
         .network_size = 0,
+        .file_format = NULL,
         .out_filepath = NULL,
     };
 
@@ -223,7 +153,7 @@ int main(int argc, char *argv[]) {
 
     Array X, y;
     if (!strcmp("train", argv[0])) {
-        json_read(argv[1], &X, &y, ml_configs.label_keys, ml_configs.n_label_keys, ml_configs.input_keys, ml_configs.n_input_keys, true);
+        file_read(argv[1], &X, &y, ml_configs.input_keys, ml_configs.n_input_keys, ml_configs.label_keys, ml_configs.n_label_keys, true, ml_configs.file_format);
         nn_network_init_weights(network, ml_configs.network_size, X.shape[1], true);
         nn_network_train(
                 network, ml_configs.network_size,
@@ -235,7 +165,7 @@ int main(int argc, char *argv[]) {
         nn_network_write_weights(ml_configs.weights_filepath, network, ml_configs.network_size);
         fprintf(stderr, "weights saved on '%s'\n", ml_configs.weights_filepath);
     } else if (!strcmp("predict", argv[0])) {
-        json_read(argv[1], &X, &y, ml_configs.label_keys, ml_configs.n_label_keys, ml_configs.input_keys, ml_configs.n_input_keys, false);
+        file_read(argv[1], &X, &y, ml_configs.input_keys, ml_configs.n_input_keys, ml_configs.label_keys, ml_configs.n_label_keys, false, ml_configs.file_format);
         nn_network_init_weights(network, ml_configs.network_size, X.shape[1], false);
         nn_network_read_weights(ml_configs.weights_filepath, network, ml_configs.network_size);
         nn_network_predict(y.data, y.shape, X.data, X.shape, network, ml_configs.network_size);
diff --git a/src/parse.c b/src/parse.c
index 8d31da5..18668ec 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -152,20 +152,16 @@ void csv_read(
     in_cols = ecalloc(n_in_cols, sizeof(size_t));
     csv_keys2cols(in_cols, in_keys, n_in_cols);
 
-    if (read_output) {
-        out_cols = ecalloc(n_out_cols, sizeof(size_t));
-        csv_keys2cols(out_cols, out_keys, n_out_cols);
-    }
+    out_cols = ecalloc(n_out_cols, sizeof(size_t));
+    csv_keys2cols(out_cols, out_keys, n_out_cols);
 
     input->shape[0] = 1;
     input->shape[1] = n_in_cols;
     input->data = ecalloc(input->shape[1], sizeof(double));
 
-    if (read_output) {
-        out->shape[0] = 1;
-        out->shape[1] = n_out_cols;
-        out->data = ecalloc(input->shape[1], sizeof(double));
-    }
+    out->shape[0] = 1;
+    out->shape[1] = n_out_cols;
+    out->data = ecalloc(out->shape[1], sizeof(double));
 
     fgets(line_buffer, 1024, fp);
     for (line_ptr = line_buffer; *line_ptr != '\0'; line_ptr++) {
@@ -178,7 +174,9 @@ void csv_read(
 
     csv_readline_values(num_buffer, num_buffer_length, line_buffer, 1, separator);
     csv_columns_select(input->data + line * input->shape[1], num_buffer, in_cols, n_in_cols, num_buffer_length);
-    if (read_output) csv_columns_select(out->data + line * out->shape[1], num_buffer, out_cols, n_out_cols, num_buffer_length);
+    if (read_output) {
+        csv_columns_select(out->data + line * out->shape[1], num_buffer, out_cols, n_out_cols, num_buffer_length);
+    }
 
     for (line = 1; fgets(line_buffer, 1024, fp) != NULL; line++) {
         csv_readline_values(num_buffer, num_buffer_length, line_buffer, line+1, separator);
@@ -187,9 +185,9 @@ void csv_read(
         input->data = erealloc(input->data, input->shape[0] * input->shape[1] * sizeof(double));
         csv_columns_select(input->data + line * input->shape[1], num_buffer, in_cols, n_in_cols, num_buffer_length);
 
+        out->shape[0]++;
+        out->data = erealloc(out->data, out->shape[0] * out->shape[1] * sizeof(double));
         if (read_output) {
-            out->shape[0]++;
-            out->data = erealloc(out->data, out->shape[0] * out->shape[1] * sizeof(double));
             csv_columns_select(out->data + line * out->shape[1], num_buffer, out_cols, n_out_cols, num_buffer_length);
         }
     }
diff --git a/src/util.c b/src/util.c
index cd87d5c..8a7924f 100644
--- a/src/util.c
+++ b/src/util.c
@@ -91,11 +91,12 @@ void usage(int exit_code)
 {
     FILE *fp = (!exit_code) ? stdout : stderr;
     fprintf(fp,
-            "Usage: ml train [Options] JSON_FILE\n"
+            "Usage: ml train [Options] FILE\n"
             "   or: ml predict [-o FILE] FILE\n"
             "\n"
             "Options:\n"
             "  -h, --help               Show this message\n"
+            "  -f, --format=FORMAT      File input and/or output format\n"
             "  -a, --alpha=ALPHA        Learning rate (only works with train)\n"
             "  -e, --epochs=EPOCHS      Epochs to train the model (only works with train)\n"
             "  -o, --output=FILE        Output file (only works with predict)\n"
@@ -111,6 +112,7 @@ void util_load_cli(struct Configs *ml, int argc, char *argv[])
     static struct option long_opts[] = {
         {"help",        no_argument,        0, 'h'},
         {"version",     no_argument,        0, 'v'},
+        {"format",      required_argument,  0, 'f'},
         {"epochs",      required_argument,  0, 'e'},
         {"alpha",       required_argument,  0, 'a'},
         {"output",      required_argument,  0, 'o'},
@@ -120,7 +122,7 @@ void util_load_cli(struct Configs *ml, int argc, char *argv[])
     int c;
 
     while (1) {
-        c = getopt_long(argc, argv, "hvc:e:a:o:i:l:", long_opts, NULL);
+        c = getopt_long(argc, argv, "hvc:e:a:o:i:f:", long_opts, NULL);
 
         if (c == -1) {
             break;
@@ -138,6 +140,9 @@ void util_load_cli(struct Configs *ml, int argc, char *argv[])
         case 'c':
             ml->config_filepath = optarg;
             break;
+        case 'f':
+            ml->file_format = optarg;
+            break;
         case 'h':
             usage(0);
         case 'v':
diff --git a/src/util.h b/src/util.h
index a3ef908..9523ab7 100644
--- a/src/util.h
+++ b/src/util.h
@@ -13,6 +13,7 @@ struct Configs {
     char *weights_filepath;
     char *config_filepath;
     /* cli cfgs */
+    char *file_format;
     char *in_filepath;
     char *out_filepath;
     /* layer cfgs */
author	jvech <jmvalenciae@unal.edu.co>	2024-07-24 15:31:02 -0500
committer	jvech <jmvalenciae@unal.edu.co>	2024-07-24 15:31:02 -0500
commit	d45581c0b067b9526ce88ba9d3a1bd861f4ff7cc (patch)
tree	a907346b2b282437537d7f4f6b138b3efddcce22
parent	b9deaf6ec1ba587f2b81a63c75b696c6def33436 (diff)