From ce0001538820d819bf965a24ffbb6f6e6269859c Mon Sep 17 00:00:00 2001 From: jvech Date: Fri, 26 Jul 2024 09:47:52 -0500 Subject: add: file_write() implemented things implemented: - CLI option --only-out added - CLI option --format works more intuitively - csv tsv and json output support --- src/main.c | 61 ++++++----------- src/parse.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- src/parse.h | 10 +++ src/util.c | 14 +++- src/util.h | 2 + 5 files changed, 227 insertions(+), 78 deletions(-) (limited to 'src') diff --git a/src/main.c b/src/main.c index e692756..216d8d4 100644 --- a/src/main.c +++ b/src/main.c @@ -34,45 +34,6 @@ #define ARRAY_SIZE(x, type) sizeof(x) / sizeof(type) -static void json_write( - const char *filepath, - Array input, Array out, - char *out_keys[], size_t out_keys_size, - char *in_keys[], size_t in_keys_size); - -void json_write( - const char *filepath, - Array input, Array out, - char *out_keys[], size_t out_keys_size, - char *in_keys[], size_t in_keys_size) -{ - FILE *fp = (!filepath) ? fopen("/dev/stdout", "w") : fopen(filepath, "w"); - if (!fp) die("json_read() Error:"); - fprintf(fp, "[\n"); - - for (size_t i = 0; i < input.shape[0]; i++) { - fprintf(fp, " {\n"); - - for (size_t j = 0; j < input.shape[1]; j++) { - size_t index = input.shape[1] * i + j; - fprintf(fp, " \"%s\": %lf,\n", in_keys[j], input.data[index]); - } - - for (size_t j = 0; j < out.shape[1]; j++) { - size_t index = out.shape[1] * i + j; - fprintf(fp, " \"%s\": %lf", out_keys[j], out.data[index]); - - if (j == out.shape[1] - 1) fprintf(fp, "\n"); - else fprintf(fp, ",\n"); - } - - if (i == input.shape[0] - 1) fprintf(fp, " }\n"); - else fprintf(fp, " },\n"); - } - fprintf(fp, "]\n"); - fclose(fp); -} - void load_config(struct Configs *cfg, int n_args, ...) { char *filepath; @@ -133,6 +94,7 @@ int main(int argc, char *argv[]) { .alpha = 1e-5, .config_filepath = "utils/settings.cfg", .network_size = 0, + .only_out = false, .file_format = NULL, .out_filepath = NULL, }; @@ -153,7 +115,10 @@ int main(int argc, char *argv[]) { Array X, y; if (!strcmp("train", argv[0])) { - file_read(argv[1], &X, &y, ml_configs.input_keys, ml_configs.n_input_keys, ml_configs.label_keys, ml_configs.n_label_keys, true, ml_configs.file_format); + file_read(argv[1], &X, &y, + ml_configs.input_keys, ml_configs.n_input_keys, + ml_configs.label_keys, ml_configs.n_label_keys, + true, ml_configs.file_format); nn_network_init_weights(network, ml_configs.network_size, X.shape[1], true); nn_network_train( network, ml_configs.network_size, @@ -165,11 +130,23 @@ int main(int argc, char *argv[]) { nn_network_write_weights(ml_configs.weights_filepath, network, ml_configs.network_size); fprintf(stderr, "weights saved on '%s'\n", ml_configs.weights_filepath); } else if (!strcmp("predict", argv[0])) { - file_read(argv[1], &X, &y, ml_configs.input_keys, ml_configs.n_input_keys, ml_configs.label_keys, ml_configs.n_label_keys, false, ml_configs.file_format); + file_read(argv[1], &X, &y, + ml_configs.input_keys, ml_configs.n_input_keys, + ml_configs.label_keys, ml_configs.n_label_keys, + false, ml_configs.file_format); nn_network_init_weights(network, ml_configs.network_size, X.shape[1], false); nn_network_read_weights(ml_configs.weights_filepath, network, ml_configs.network_size); nn_network_predict(y.data, y.shape, X.data, X.shape, network, ml_configs.network_size); - json_write(ml_configs.out_filepath, X, y, ml_configs.label_keys, ml_configs.n_label_keys, ml_configs.input_keys, ml_configs.n_input_keys); + + // If neither output and file_format defined use input to define the format + if (!ml_configs.file_format && !ml_configs.out_filepath) { + ml_configs.file_format = file_format_infer(ml_configs.in_filepath); + } + + file_write(ml_configs.out_filepath, X, y, + ml_configs.input_keys, ml_configs.n_input_keys, + ml_configs.label_keys, ml_configs.n_label_keys, + !ml_configs.only_out, ml_configs.file_format); } else usage(1); nn_network_free_weights(network, ml_configs.network_size); diff --git a/src/parse.c b/src/parse.c index 18668ec..c9b17ca 100644 --- a/src/parse.c +++ b/src/parse.c @@ -1,3 +1,21 @@ +/** + * ml - a neural network processor written with C + * Copyright (C) 2023 jvech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + #include #include #include @@ -26,6 +44,21 @@ static void csv_read( char separator ); +static void json_write( + FILE *fp, + Array input, Array out, + char *in_keys[], size_t in_keys_size, + char *out_keys[], size_t out_keys_size, + bool write_input + ); + +static void csv_write( + FILE *fp, + Array input, Array out, + bool write_input, + char separator + ); + static void csv_columns_select( double *dst_row, double *src_row, size_t selected_cols[], size_t cols_size, @@ -38,6 +71,7 @@ static void csv_readline_values( static void csv_keys2cols(size_t cols[], char *keys[], size_t keys_size); + void file_read( char *filepath, Array *input, Array *out, @@ -47,22 +81,19 @@ void file_read( char *file_format) { FILE *fp; - char *ptr; - int i, string_length; - fp = (!strcmp(filepath, "-")) ? fopen("/dev/stdin", "r") : fopen(filepath, "r"); - if (fp == NULL) die("file_read() Error:"); - - if (file_format == NULL && !strcmp(filepath, "-")) { - die("file_read() Error: on standard input the format must be defined"); + if (filepath != NULL && strcmp(filepath, "-")) { + fp = fopen(filepath, "r"); + file_format = file_format_infer(filepath); + } else { + fp = fopen("/dev/stdin", "r"); + if (file_format == NULL) { + die("file_read() Error: file format must be defined"); + } } if (file_format == NULL) { - string_length = strlen(filepath); - ptr = filepath + string_length; - for (i = string_length; i > 0 && *ptr != '.'; ptr--, i--); - if (*ptr != '.' || i == 0) die("file_read() Error: unable to infer %s format", filepath); - file_format = ptr + 1; + file_format = file_format_infer(filepath); } if (!strcmp(file_format, "csv")) csv_read(fp, input, out, in_keys, n_in_keys, out_keys, n_out_keys, read_output, false, ','); @@ -75,6 +106,38 @@ void file_read( fclose(fp); } +void file_write( + char *filepath, + Array input, Array out, + char *in_keys[], size_t n_in_keys, + char *out_keys[], size_t n_out_keys, + bool write_input, + char *file_format) +{ + FILE *fp; + + + if (filepath != NULL && strcmp(filepath, "-")) { + fp = fopen(filepath, "w"); + file_format = file_format_infer(filepath); + } else { + fp = fopen("/dev/stdout", "w"); + if (file_format == NULL) { + die("file_write() Error: file format must be defined"); + } + } + + if (fp == NULL) die("file_write() Error:"); + + if (!strcmp(file_format, "json")) json_write(fp, input, out, in_keys, n_in_keys, out_keys, n_out_keys, write_input); + else if (!strcmp(file_format, "csv")) csv_write(fp, input, out, write_input, ','); + else if (!strcmp(file_format, "tsv")) csv_write(fp, input, out, write_input, '\t'); + else { + die("file_write() Error: unable to write %s files", file_format); + } + fclose(fp); +} + void json_read( FILE *fp, Array *input, Array *out, @@ -83,11 +146,12 @@ void json_read( bool read_output) { static char fp_buffer[MAX_FILE_SIZE]; + size_t i, j, json_obj_length, index; if (fp == NULL) goto json_read_error; - size_t i = 0; + i = 0; do { if (i >= MAX_FILE_SIZE) die("json_read() Error: file size is bigger than '%zu'", i, MAX_FILE_SIZE); fp_buffer[i] = fgetc(fp); @@ -95,7 +159,7 @@ void json_read( json_object *json_obj; json_obj = json_tokener_parse(fp_buffer); - size_t json_obj_length = json_object_array_length(json_obj); + json_obj_length = json_object_array_length(json_obj); input->shape[0] = (size_t)json_obj_length; input->shape[1] = n_input_keys; @@ -107,18 +171,18 @@ void json_read( if (!input->data || !out->data) goto json_read_error; - for (int i = 0; i < json_object_array_length(json_obj); i++) { + for (i = 0; i < json_object_array_length(json_obj); i++) { json_object *item = json_object_array_get_idx(json_obj, i); - for (int j = 0; j < n_input_keys; j++) { - size_t index = n_input_keys * i + j; + for (j = 0; j < n_input_keys; j++) { + index = n_input_keys * i + j; input->data[index] = json_object_get_double(json_object_object_get(item, in_keys[j])); } if (!read_output) continue; - for (int j = 0; j < n_out_keys; j++) { - size_t index = n_out_keys * i + j; + for (j = 0; j < n_out_keys; j++) { + index = n_out_keys * i + j; out->data[index] = json_object_get_double(json_object_object_get(item, out_keys[j])); } } @@ -197,6 +261,74 @@ void csv_read( return; } +void json_write( + FILE *fp, + Array input, Array out, + char *in_keys[], size_t in_keys_size, + char *out_keys[], size_t out_keys_size, + bool write_input) +{ + fprintf(fp, "[\n"); + + if (in_keys_size != input.shape[1] && write_input) { + die("json_write() Error: there are more keys (%zu) than input columns (%zu)", + in_keys_size, input.shape[1]); + } + + if (out_keys_size != out.shape[1]) { + die("json_write() Error: there are more keys (%zu) than output columns (%zu)", + out_keys_size, out.shape[1]); + } + + for (size_t i = 0; i < input.shape[0]; i++) { + fprintf(fp, " {\n"); + + if (write_input) { + for (size_t j = 0; j < input.shape[1]; j++) { + size_t index = input.shape[1] * i + j; + fprintf(fp, " \"%s\": %lf,\n", in_keys[j], input.data[index]); + } + } + + for (size_t j = 0; j < out.shape[1]; j++) { + size_t index = out.shape[1] * i + j; + fprintf(fp, " \"%s\": %lf", out_keys[j], out.data[index]); + + if (j == out.shape[1] - 1) fprintf(fp, "\n"); + else fprintf(fp, ",\n"); + } + + if (i == input.shape[0] - 1) fprintf(fp, " }\n"); + else fprintf(fp, " },\n"); + } + fprintf(fp, "]\n"); +} + +void csv_write( + FILE *fp, + Array input, Array out, + bool write_input, + char separator + ) +{ + size_t line, col, index; + for (line = 0; line < input.shape[0]; line++) { + if (write_input) { + for (col = 0; col < input.shape[1]; col++) { + index = input.shape[1] * line + col; + fprintf(fp, "%lf%c", input.data[index], separator); + } + } + for (col = 0; col < out.shape[1]; col++) { + index = out.shape[1] * line + col; + fprintf(fp, "%lf", out.data[index]); + if (col == out.shape[1] - 1) continue; + fprintf(fp, "%c", separator); + } + fprintf(fp, "\n"); + } +} + void csv_columns_select( double *dst_row, double *src_row, size_t selected_cols[], size_t cols_size, @@ -265,11 +397,27 @@ void csv_keys2cols(size_t cols[], char *keys[], size_t keys_size) } } +char * file_format_infer(char *filename) +{ + char *file_format, *ptr; + size_t string_length, i; + + string_length = strlen(filename); + ptr = filename + string_length; + for (i = string_length; i > 0 && *ptr != '.'; ptr--, i--); + if (*ptr != '.' || i == 0) { + die("file_format_infer() Error: unable to infer %s format", filename); + } + file_format = ptr + 1; + return file_format; +} + + #ifdef PARSE_TEST #include #include /* - * compile: clang -Wall -g -DPARSE_TEST -o objs/test_parse src/util.c src/parse.c $(pkg-config --libs-only-l json-c) + * compile: clang -Wall -Wextra -g -DPARSE_TEST -o objs/test_parse src/util.c src/parse.c $(pkg-config --libs-only-l json-c) */ size_t parse_keys(char *keys[], char *argv, char key_buffer[512]) { @@ -291,47 +439,51 @@ size_t parse_keys(char *keys[], char *argv, char key_buffer[512]) } int main(int argc, char *argv[]) { - char *filename, *format; + char *in_file, *out_file, *format; size_t i, j; - if (argc < 4 || argc > 5) { + if (argc != 5 && argc != 6) { fprintf(stderr, - "Usage: parse_test FILENAME IN_KEYS OUT_KEYS [FORMAT]\n" + "Usage: parse_test IN_FILE OUT_FILE IN_KEYS OUT_KEYS [FORMAT]\n" "\nKeys format:\n" " IN_KEYS: in_key1, in_key2, ...\n" " OUT_KEYS: out_key1, out_key2, ...\n\n"); return 1; } - filename = argv[1]; + in_file = argv[1]; + out_file = argv[2]; format = NULL; - if (argc == 5) { - format = argv[4]; + if (argc == 6) { + format = argv[5]; } Array X, y; char *in_cols[32], *out_cols[32], keys_buffer[512]; size_t n_in_cols, n_out_cols; - n_in_cols = parse_keys(in_cols, argv[2], keys_buffer); - n_out_cols = parse_keys(out_cols, argv[3], keys_buffer); + n_in_cols = parse_keys(in_cols, argv[3], keys_buffer); + n_out_cols = parse_keys(out_cols, argv[4], keys_buffer); - file_read(filename, &X, &y, in_cols, 2, out_cols, 1, true, format); + file_read(in_file, &X, &y, in_cols, n_in_cols, out_cols, n_out_cols, true, format); for (i = 0; i < X.shape[0]; i++) { for (j = 0; j < X.shape[1]; j++) { - printf("%*.2e\t", 4, X.data[i * X.shape[1] + j]); + fprintf(stderr, "%.2e\t", X.data[i * X.shape[1] + j]); } for (j = 0; j < y.shape[1]; j++) { - if (j == 0) printf("|\t"); - printf("%5.2e", y.data[i * y.shape[1] + j]);; + if (j == 0) fprintf(stderr, "|\t"); + fprintf(stderr, "%.2e", y.data[i * y.shape[1] + j]);; if (j < y.shape[1] - 1) printf("\t"); } - printf("\n"); + fprintf(stderr, "\n"); } + // use input format if format variable is not defined + format = (!format && !strcmp(out_file, "-")) ? file_format_infer(in_file) : format; + file_write(out_file, X, y, in_cols, n_in_cols, out_cols, n_out_cols, true, format); for (i = 0; i < n_in_cols; i++) free(in_cols[i]); for (i = 0; i < n_out_cols; i++) free(out_cols[i]); diff --git a/src/parse.h b/src/parse.h index d5f99d0..d8aeada 100644 --- a/src/parse.h +++ b/src/parse.h @@ -18,4 +18,14 @@ void file_read( bool read_output, char *file_format ); + +void file_write( + char *filepath, + Array input, Array out, + char *in_keys[], size_t n_in_keys, + char *out_keys[], size_t n_out_keys, + bool write_input, + char *file_format); + +char * file_format_infer(char *filename); #endif diff --git a/src/util.c b/src/util.c index 8a7924f..8fa8a87 100644 --- a/src/util.c +++ b/src/util.c @@ -92,14 +92,15 @@ void usage(int exit_code) FILE *fp = (!exit_code) ? stdout : stderr; fprintf(fp, "Usage: ml train [Options] FILE\n" - " or: ml predict [-o FILE] FILE\n" + " or: ml predict [-Ohv] [-f FORMAT] [-o FILE] FILE\n" "\n" "Options:\n" " -h, --help Show this message\n" - " -f, --format=FORMAT File input and/or output format\n" + " -f, --format=FORMAT Define input or output FILE format if needed\n" " -a, --alpha=ALPHA Learning rate (only works with train)\n" " -e, --epochs=EPOCHS Epochs to train the model (only works with train)\n" " -o, --output=FILE Output file (only works with predict)\n" + " -O, --only-out Don't show input fields (only works with predict)\n" " -c, --config=FILE Configuration filepath [default=~/.config/ml/ml.cfg]\n" "\n" ); @@ -117,12 +118,13 @@ void util_load_cli(struct Configs *ml, int argc, char *argv[]) {"alpha", required_argument, 0, 'a'}, {"output", required_argument, 0, 'o'}, {"config", required_argument, 0, 'c'}, + {"only-out", no_argument, 0, 'O'}, {0, 0, 0, 0 }, }; int c; while (1) { - c = getopt_long(argc, argv, "hvc:e:a:o:i:f:", long_opts, NULL); + c = getopt_long(argc, argv, "hvOc:e:a:o:i:f:", long_opts, NULL); if (c == -1) { break; @@ -143,12 +145,18 @@ void util_load_cli(struct Configs *ml, int argc, char *argv[]) case 'f': ml->file_format = optarg; break; + case 'O': + ml->only_out = true; + break; case 'h': usage(0); + break; case 'v': version(); + break; default: usage(1); + break; } } diff --git a/src/util.h b/src/util.h index 9523ab7..dbaae15 100644 --- a/src/util.h +++ b/src/util.h @@ -1,6 +1,7 @@ #ifndef UTIL_ #define UTIL_ +#include #include struct Configs { @@ -16,6 +17,7 @@ struct Configs { char *file_format; char *in_filepath; char *out_filepath; + bool only_out; /* layer cfgs */ size_t network_size; size_t *neurons; -- cgit v1.2.3-70-g09d2