/** * ml - a neural network processor written with C * Copyright (C) 2023 jvech * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include "util.h" #include "parse.h" #define MAX_FILE_SIZE 536870912 //1<<29; 0.5 GiB static void json_read( FILE *fp, Array *input, Array *out, struct Configs cfgs, bool read_output ); static void csv_read( FILE *fp, Array *input, Array *out, struct Configs cfgs, bool read_output, char *separator ); static void json_write( FILE *fp, Array input, Array out, struct Configs cfgs ); static void csv_write( FILE *fp, Array input, Array out, struct Configs cfgs, char *separator ); void file_read( char *filepath, Array *input, Array *out, struct Configs ml_config, bool read_output) { FILE *fp; char *file_format = ml_config.file_format; if (filepath != NULL && strcmp(filepath, "-")) { fp = fopen(filepath, "r"); file_format = file_format_infer(filepath); } else { fp = fopen("/dev/stdin", "r"); if (file_format == NULL) { die("file_read() Error: file format must be defined"); } } if (file_format == NULL) { file_format = file_format_infer(filepath); } if (!strcmp(file_format, "csv")) csv_read(fp, input, out, ml_config, read_output, ","); else if (!strcmp(file_format, "tsv")) csv_read(fp, input, out, ml_config, read_output, "\t"); else if (!strcmp(file_format, "json")) json_read(fp, input, out, ml_config, read_output); else { die("file_read() Error: unable to parse %s files", file_format); } fclose(fp); } void file_write(Array input, Array out, struct Configs ml_config) { FILE *fp; char *filepath = ml_config.out_filepath; char *file_format = ml_config.file_format; if (filepath != NULL && strcmp(filepath, "-")) { fp = fopen(filepath, "w"); file_format = file_format_infer(filepath); } else { fp = fopen("/dev/stdout", "w"); if (file_format == NULL) { die("file_write() Error: file format must be defined"); } } if (fp == NULL) die("file_write() Error:"); if (!strcmp(file_format, "json")) json_write(fp, input, out, ml_config); else if (!strcmp(file_format, "csv")) csv_write(fp, input, out, ml_config, ","); else if (!strcmp(file_format, "tsv")) csv_write(fp, input, out, ml_config, "\t"); else { die("file_write() Error: unable to write %s files", file_format); } fclose(fp); } void data_postprocess( Array *out, double *data, size_t data_shape[2], struct Configs cfgs, bool is_input) { char **keys = (is_input) ? cfgs.input_keys : cfgs.label_keys; size_t n_keys = (is_input) ? cfgs.n_input_keys : cfgs.n_label_keys; char **categorical_keys = cfgs.categorical_keys; size_t n_categorical_keys = cfgs.n_categorical_keys; char ***categorical_values = cfgs.categorical_values; size_t *n_categorical_values = cfgs.n_categorical_values; size_t i, j, data_j; for (data_j = j = 0; j < n_keys; j++) { int k; switch (out->type[j]) { case ARRAY_NUMERICAL: for (i = 0; i < data_shape[0]; i++) { size_t data_index = i * data_shape[1] + data_j; size_t index = i * out->shape[1] + j; out->data[index].numeric = data[data_index]; } data_j++; break; case ARRAY_ONEHOT: k = util_get_key_index(keys[j], categorical_keys, n_categorical_keys); if (k == -1) { die("data_postprocess() Error: field '%s' is not registered as categorical", keys[j]); } for (i = 0; i < data_shape[0]; i++) { size_t index = i * out->shape[1] + j; size_t data_index = i * data_shape[1] + data_j; int onehot_i = util_argmax(data + data_index, n_categorical_values[k]); out->data[index].categorical = e_strdup(categorical_values[k][onehot_i]); } data_j += n_categorical_values[k]; break; default: die("data_postprocess() Error: unexpected type received on '%s' field", keys[j]); } } } double * data_preprocess( size_t out_shape[2], Array data, struct Configs cfgs, bool is_input, bool only_allocate) { double *out; char **keys = (is_input) ? cfgs.input_keys : cfgs.label_keys; size_t n_keys = (is_input) ? cfgs.n_input_keys : cfgs.n_label_keys; char **categorical_keys = cfgs.categorical_keys; size_t n_categorical_keys = cfgs.n_categorical_keys; char ***categorical_values = cfgs.categorical_values; size_t *n_categorical_values = cfgs.n_categorical_values; size_t i, j, out_j; out_shape[0] = data.shape[0]; out_shape[1] = 0; for (i = 0; i < n_keys; i++) { int n; switch (data.type[i]) { case ARRAY_NUMERICAL: out_shape[1]++; break; case ARRAY_ONEHOT: n = util_get_key_index(keys[i], categorical_keys, n_categorical_keys); if (n == -1) die("data_preprocess() Error: field '%s' is not marked as categorical", keys[i]); out_shape[1] += n_categorical_values[n]; break; default: die("data_preprocess() Error: field '%s' has an unknown type", keys[i]); break; } } out = ecalloc(out_shape[0] * out_shape[1], sizeof(double)); if (only_allocate) return out; for (out_j = j = 0; j < data.shape[1]; j++) { switch (data.type[j]) { int k; case ARRAY_NUMERICAL: for (i = 0; i < out_shape[0]; i++) { size_t index = i * data.shape[1] + j; size_t out_index = i * out_shape[1] + out_j; out[out_index] = data.data[index].numeric; } out_j++; break; case ARRAY_ONEHOT: k = util_get_key_index(keys[j], categorical_keys, n_categorical_keys); for (i = 0; i < out_shape[0]; i++) { int onehot_i; size_t index = i * data.shape[1] + j; onehot_i = util_get_key_index(data.data[index].categorical, categorical_values[k], n_categorical_values[k]); if (onehot_i == -1) { die("data_preprocess() Error: unexpected '%s' value found", data.data[index].categorical); } size_t out_index = i * out_shape[1] + out_j + onehot_i; out[out_index] = 1.0; } out_j += n_categorical_values[k]; break; default: die("data_preprocess() Error: field '%s' has an unknown type", keys[j]); } } return out; } void array_free(Array *x) { size_t i, j, index; for (j = 1; j < x->shape[1]; j++) { switch (x->type[j]) { case ARRAY_ORDINAL: case ARRAY_ONEHOT: for (i = 0; i < x->shape[0]; i++) { index = x->shape[1] * i + j; free(x->data[index].categorical); } break; default: break; } } free(x->type); free(x->data); } void json_read( FILE *fp, Array *input, Array *out, struct Configs cfgs, bool read_output) { static char fp_buffer[MAX_FILE_SIZE]; size_t i, j, json_obj_length, index; json_object *json_obj, *item, *value; json_type obj_type; char **in_keys = cfgs.input_keys; char **out_keys = cfgs.label_keys; char **onehot_keys = cfgs.onehot_keys; size_t n_input_keys = cfgs.n_input_keys; size_t n_out_keys = cfgs.n_label_keys; size_t n_onehot_keys = cfgs.n_onehot_keys; if (fp == NULL) die("json_read() Error:"); i = 0; do { if (i >= MAX_FILE_SIZE) die("json_read() Error: file size is bigger than '%zu'", i, MAX_FILE_SIZE); fp_buffer[i] = fgetc(fp); } while (fp_buffer[i++] != EOF); json_obj = json_tokener_parse(fp_buffer); if (!json_object_is_type(json_obj, json_type_array)) { die("json_read() Error: unexpected JSON data received, expecting an array"); } json_obj_length = json_object_array_length(json_obj); input->shape[0] = (size_t)json_obj_length; input->shape[1] = n_input_keys; input->type = ecalloc(input->shape[1], sizeof(enum ArrayType)); input->data = ecalloc(input->shape[0] * input->shape[1], sizeof(input->data[0])); out->shape[0] = (size_t)json_obj_length; out->shape[1] = n_out_keys; out->type = ecalloc(out->shape[1], sizeof(enum ArrayType)); out->data = ecalloc(out->shape[0] * out->shape[1], sizeof(out->data[0])); for (i = 0; i < n_onehot_keys; i++) { for (j = 0; j < n_input_keys; j++) { if (!strcmp(onehot_keys[i], in_keys[j])) { input->type[j] = ARRAY_ONEHOT; } } for (j = 0; j < n_out_keys; j++) { if (!strcmp(onehot_keys[i], out_keys[j])) { out->type[j] = ARRAY_ONEHOT; } } } for (i = 0; i < json_object_array_length(json_obj); i++) { item = json_object_array_get_idx(json_obj, i); if (!json_object_is_type(item, json_type_object)) { die("json_read() Error: unexpected JSON data received, expecting an object"); } if ((size_t)json_object_object_length(item) < n_input_keys + n_out_keys) { die("json_read() Error: the number of keys required is greater " "than the keys available in the object:\n%s", json_object_to_json_string_ext(item, JSON_C_TO_STRING_PRETTY)); } for (j = 0; j < n_input_keys; j++) { value = json_object_object_get(item, in_keys[j]); obj_type = json_object_get_type(value); index = n_input_keys * i + j; switch (input->type[j]) { case ARRAY_NUMERICAL: switch (obj_type) { case json_type_int: case json_type_double: input->data[index].numeric = json_object_get_double(value); break; default: die("json_read() Error: unexpected JSON data received, expecting a number"); } break; case ARRAY_ONEHOT: switch (obj_type) { case json_type_int: case json_type_string: input->data[index].categorical = e_strdup(json_object_get_string(value)); break; default: die("json_read() Error: unexpected JSON data received, expecting a string or integer"); } break; default: die("json_read() Error: preprocess field type '%s' is not implemented", in_keys[j]); } } if (!read_output) continue; for (j = 0; j < n_out_keys; j++) { value = json_object_object_get(item, out_keys[j]); obj_type = json_object_get_type(value); index = n_out_keys * i + j; switch (out->type[j]) { case ARRAY_NUMERICAL: switch (obj_type) { case json_type_int: case json_type_double: out->data[index].numeric = json_object_get_double(value); break; default: die("json_read() Error: unexpected JSON data received, expecting a number"); } break; case ARRAY_ONEHOT: switch (obj_type) { case json_type_int: case json_type_string: out->data[index].categorical = e_strdup(json_object_get_string(value)); break; default: die("json_read() Error: unexpected JSON data received, expecting string or integer"); } break; default: die("json_read() Error: preprocess field type '%s' is not implemented", out_keys[j]); } } } json_object_put(json_obj); return; } void csv_read( FILE *fp, Array *input, Array *out, struct Configs cfgs, bool read_output, char *separator) { char *line = NULL, *line_buffer, **values_buffer; size_t line_number = 0, line_size = 0; size_t n_values_buffer; size_t *in_indexes, *out_indexes; bool has_header = true; char **in_keys, **out_keys, **onehot_keys; size_t n_in_keys, n_out_keys, n_onehot_keys; in_keys = cfgs.input_keys; out_keys = cfgs.label_keys; onehot_keys = cfgs.onehot_keys; n_in_keys = cfgs.n_input_keys; n_out_keys = cfgs.n_label_keys; n_onehot_keys = cfgs.n_onehot_keys; n_values_buffer = n_in_keys + n_out_keys; values_buffer = ecalloc(n_values_buffer, sizeof(char *)); in_indexes = ecalloc(n_in_keys, sizeof(size_t)); out_indexes = ecalloc(n_out_keys, sizeof(size_t)); if (fp == NULL) die("csv_read() Error:"); input->type = ecalloc(n_in_keys, sizeof(enum ArrayType)); out->type = ecalloc(n_out_keys, sizeof(enum ArrayType)); input->data = NULL; out->data = NULL; input->shape[0] = out->shape[0] = 0; input->shape[1] = n_in_keys; out->shape[1] = n_out_keys; for (size_t i = 0; i < n_in_keys; i++) { int ret = util_get_key_index(in_keys[i], onehot_keys, n_onehot_keys); if (ret >= 0) input->type[i] = ARRAY_ONEHOT; } for (size_t i = 0; i < n_out_keys; i++) { int ret = util_get_key_index(out_keys[i], onehot_keys, n_onehot_keys); if (ret >= 0) out->type[i] = ARRAY_ONEHOT; } while (getline(&line, &line_size, fp) != -1) { /* Get line values */ char *value; size_t cols = 0; line_buffer = line; *(strstr(line, "\n")) = '\0'; //strip new line character e.g ("line text\n" -> "line text") while ((value = strsep(&line_buffer, separator))) { if (cols == n_values_buffer && line_number == 0) { n_values_buffer++; values_buffer = erealloc(values_buffer, n_values_buffer * sizeof(char *)); } else if (cols == n_values_buffer) { die("csv_read() Error: line %d has different columns than other lines", line_number); } values_buffer[cols++] = value; } /* Set up keys indexes */ if (line_number == 0) { size_t i; int key_index; for (i = 0; i < n_in_keys && has_header; i++) { key_index = util_get_key_index(in_keys[i], values_buffer, n_values_buffer); if (key_index == -1) has_header = false; } for (i = 0; i < n_out_keys && read_output && has_header; i++) { key_index = util_get_key_index(out_keys[i], values_buffer, n_values_buffer); if (key_index == -1) has_header = false; } for (i = 0; i < n_in_keys; i++) { key_index = util_get_key_index(in_keys[i], values_buffer, n_values_buffer); in_indexes[i] = has_header ? (size_t)key_index : i; } for (i = 0; i < n_out_keys && read_output; i++) { key_index = util_get_key_index(out_keys[i], values_buffer, n_values_buffer); out_indexes[i] = has_header ? (size_t)key_index : i + n_in_keys; } } if (has_header && !line_number) { line_number++; continue; } /* Allocate memory for the data */ input->data = erealloc(input->data, (input->shape[0] + 1) * n_in_keys * sizeof(union ArrayValue)); out->data = erealloc(out->data, (out->shape[0] + 1) * n_out_keys * sizeof(union ArrayValue)); /* Fill the data */ int ret; size_t i, j, index; for (i = 0; i < n_in_keys; i++) { ret = 0; j = in_indexes[i]; index = input->shape[0] * n_in_keys + i; switch (input->type[i]) { case ARRAY_NUMERICAL: ret = sscanf(values_buffer[j], "%lf", &input->data[index].numeric); if (ret < 1) die("csv_read() Error: expecting a number not '%s'", values_buffer[j]); break; case ARRAY_ONEHOT: ret = sscanf(values_buffer[j], "%lf", &input->data[index].numeric); if (ret >= 1) die("csv_read() Error: expecting a string or integer not '%s'", values_buffer[j]); input->data[index].categorical = e_strdup(values_buffer[j]); break; default: die("csv_read() Error: field '%s' has an unexpected type", in_keys[i]); } } for (i = 0; i < n_out_keys && read_output; i++) { ret = 0; j = out_indexes[i]; index = out->shape[0] * n_out_keys + i; switch (out->type[i]) { case ARRAY_NUMERICAL: ret = sscanf(values_buffer[j], "%lf", &out->data[index].numeric); if (ret < 1) die("csv_read() Error: expecting a number not '%s'", values_buffer[j]); break; case ARRAY_ONEHOT: out->data[index].categorical = e_strdup(values_buffer[j]); break; default: die("csv_read() Error: field '%s' has an unexpected type", out_keys[i]); } } input->shape[0]++; out->shape[0]++; line_number++; } if (errno != 0) die("csv_read() Error:"); free(line); free(in_indexes); free(out_indexes); free(values_buffer); } void json_write( FILE *fp, Array input, Array out, struct Configs cfgs) { char **in_keys = cfgs.input_keys; char **out_keys = cfgs.label_keys; size_t n_in_keys = cfgs.n_input_keys; size_t n_out_keys = cfgs.n_label_keys; bool write_input = !cfgs.only_out; int decimal_precision = cfgs.decimal_precision; json_object *root = json_object_new_array(); if (!root) { die("json_write() Error: Unable to create json_data"); } if (n_in_keys != input.shape[1]) die("json_write() Error: input keys and data columns have different sizes"); if (n_out_keys != out.shape[1]) die("json_write() Error: output keys and data columns have different sizes"); size_t i, j; for (i = 0; i < input.shape[0]; i++) { json_object *obj = json_object_new_object(); if (write_input) { for (j = 0; j < input.shape[1]; j++) { char buffer[128]; size_t index = i * input.shape[1] + j; switch (input.type[j]) { case ARRAY_NUMERICAL: sprintf(buffer, "%g", input.data[index].numeric); json_object_object_add(obj, in_keys[j], json_object_new_double_s(input.data[index].numeric, buffer)); break; case ARRAY_ONEHOT: json_object_object_add(obj, in_keys[j], json_object_new_string(input.data[index].categorical)); break; default: die("json_write(): Unexpected value received"); } } } for (j = 0; j < out.shape[1]; j++) { size_t index = i * out.shape[1] + j; char buffer[32]; switch (out.type[j]) { case ARRAY_NUMERICAL: sprintf(buffer, "%.*g", decimal_precision, out.data[index].numeric); json_object_object_add(obj, out_keys[j], json_object_new_double_s(out.data[index].numeric, buffer)); break; case ARRAY_ONEHOT: json_object_object_add(obj, out_keys[j], json_object_new_string(out.data[index].categorical)); break; default: die("json_write(): Unexpected value received"); } } json_object_array_add(root, obj); } int ret = fprintf(fp, "%s", json_object_to_json_string_ext(root, JSON_C_TO_STRING_PRETTY | JSON_C_TO_STRING_SPACED)); if (ret == -1) { die("json_write() Error: unable to write json data"); } json_object_put(root); } void csv_write( FILE *fp, Array input, Array out, struct Configs cfgs, char *separator) { int decimal_precision = cfgs.decimal_precision; bool write_input = !cfgs.only_out; size_t i,j,index; for (j = 0; j < cfgs.n_input_keys && write_input; j++) { fprintf(fp, "%s%s", cfgs.input_keys[j], separator); } for (j = 0; j < cfgs.n_label_keys; j++) { fprintf(fp, "%s", cfgs.label_keys[j]); if (j == cfgs.n_label_keys - 1) fprintf(fp, "\n"); else fprintf(fp, "%s", separator); } for (i = 0; i < input.shape[0]; i++) { for (j = 0; j < input.shape[1] && write_input; j++) { index = i * out.shape[1] + j; switch (input.type[j] ) { case ARRAY_NUMERICAL: fprintf(fp, "%.*g%s", decimal_precision, input.data[index].numeric, separator); break; case ARRAY_ONEHOT: fprintf(fp, "%s%s", input.data[index].categorical, separator); break; default: die("csv_write() Error: Unexpected type found on field '%s'", cfgs.input_keys[j]); } } for (j = 0; j < out.shape[1]; j++) { index = i * out.shape[1] + j; switch (out.type[j] ) { case ARRAY_NUMERICAL: fprintf(fp, "%.*g", decimal_precision, out.data[index].numeric); break; case ARRAY_ONEHOT: fprintf(fp, "%s", out.data[index].categorical); break; default: die("csv_write() Error: Unexpected type found on field '%s'", cfgs.label_keys[j]); } if (j == out.shape[1] - 1) fprintf(fp, "\n"); else fprintf(fp, "%s", separator); } } } char * file_format_infer(char *filename) { char *file_format, *ptr; size_t string_length, i; string_length = strlen(filename); ptr = filename + string_length; for (i = string_length; i > 0 && *ptr != '.'; ptr--, i--); if (*ptr != '.' || i == 0) { die("file_format_infer() Error: unable to infer %s format", filename); } file_format = ptr + 1; return file_format; }