From 6b2cea33c5a5f2af90eec721ff26c4ff9de468dc Mon Sep 17 00:00:00 2001 From: jvech Date: Sat, 14 Sep 2024 19:46:36 -0500 Subject: feat: onehot feature implementation in process To make onehot available I had to refactor how the data is stored, the current implementation supports json files. --- src/util.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 2 deletions(-) (limited to 'src/util.c') diff --git a/src/util.c b/src/util.c index 81950f1..ad14632 100644 --- a/src/util.c +++ b/src/util.c @@ -25,9 +25,13 @@ #include "util.h" #define BUFFER_SIZE 1024 + +static int cmpstringp(const void *, const void *); static char ** config_read_values(size_t *n_out_keys, char *first_value, char **strtok_ptr); static void load_net_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath); static void load_lyr_cfgs(struct Configs *cfg, char *key, char *value, char *filepath); +static void load_categorical_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr); +static void load_preprocess_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath); static void add_lyr(struct Configs *cfg); void die(const char *fmt, ...) 
@@ -207,11 +211,34 @@ void util_free_config(struct Configs *ml) free(ml->activations[i]); free(ml->activations); } + + if (ml->onehot_keys != NULL) { + for (size_t i = 0; i < ml->n_onehot_keys; i++) + free(ml->onehot_keys[i]); + free(ml->onehot_keys); + } + + if (ml->categorical_keys != NULL) { + for (size_t i = 0; i < ml->n_categorical_keys; i++) + free(ml->categorical_keys[i]); + free(ml->categorical_keys); + } + + if (ml->categorical_values != NULL) { + for (size_t i = 0; i < ml->n_categorical_keys; i++) { + for (size_t j = 0; j < ml->n_categorical_values[i]; j++) { + free(ml->categorical_values[i][j]); + } + free(ml->categorical_values[i]); + } + free(ml->n_categorical_values); + free(ml->categorical_values); + } } void util_load_config(struct Configs *ml, char *filepath) { - enum Section {NET, LAYER, OUT_LAYER}; + enum Section {NET, PREPROCESSING, CATEGORICAL, LAYER, OUT_LAYER}; enum Section section; int line_number = 0; char line_buffer[BUFFER_SIZE], line_buffer_original[BUFFER_SIZE]; @@ -234,6 +261,10 @@ void util_load_config(struct Configs *ml, char *filepath) ml->network_size++; add_lyr(ml); ml->neurons[ml->network_size-1] = ml->n_label_keys; + } else if (!strcmp("preprocessing", token_buffer)) { + section = PREPROCESSING; + } else if (!strcmp("categorical_fields", token_buffer)) { + section = CATEGORICAL; } else { die("util_load_config() Error: Unknown section '%s' on %s", line_buffer, filepath); @@ -277,6 +308,12 @@ void util_load_config(struct Configs *ml, char *filepath) case NET: load_net_cfgs(ml, key, value, ptr_buffer, filepath); break; + case PREPROCESSING: + load_preprocess_cfgs(ml, key, value, ptr_buffer, filepath); + break; + case CATEGORICAL: + load_categorical_cfgs(ml, key, value, ptr_buffer); + break; case LAYER: load_lyr_cfgs(ml, key, value, filepath); break; @@ -292,6 +329,48 @@ void util_load_config(struct Configs *ml, char *filepath) break; } } + + /* Checks categorical_keys in label_keys or input_keys or onehot_keys*/ + size_t i,j,k; + 
for (i = 0; i < ml->n_categorical_keys; i++) { + int ret; + ret = util_get_key_index(ml->categorical_keys[i], ml->input_keys, ml->n_input_keys); + if (ret >= 0) continue; + ret = util_get_key_index(ml->categorical_keys[i], ml->label_keys, ml->n_label_keys); + if (ret == -1) { + die("util_load_config() Error: field '%s' does not exist", ml->categorical_keys[i]); + } + + ret = util_get_key_index(ml->categorical_keys[i], ml->onehot_keys, ml->n_onehot_keys); + if (ret >= 0) continue; + die("util_load_config() Error: field '%s' must be encoded", ml->categorical_keys[i]); + } + + /* Check onehot_keys in categorical_keys */ + for (i = 0; i < ml->n_onehot_keys; i++) { + int ret = util_get_key_index(ml->onehot_keys[i], + ml->categorical_keys, + ml->n_categorical_keys); + if (ret >= 0) continue; + die("util_load_config() Error: one hot field '%s' is not defined as categorical", ml->onehot_keys[i]); + } + + /* Determine out layer neurons */ + size_t *out_layer_neurons = ml->neurons + ml->network_size - 1; + *out_layer_neurons = 0; + for (i = 0; i < ml->n_label_keys; i++) { + int ret = 1; + + for (j = 0; ret && j < ml->n_categorical_keys; j++) + ret = strcmp(ml->categorical_keys[j], ml->label_keys[i]); + + for (k = 0; ret && k < ml->n_onehot_keys; k++) + ret = strcmp(ml->onehot_keys[k], ml->label_keys[i]); + + *out_layer_neurons += (!ret) ? 
ml->n_categorical_values[i] : 1; + } + + fclose(fp); return; @@ -331,7 +410,40 @@ void load_net_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr else if (!strcmp(key, "alpha")) cfg->alpha = (double)atof(value); else if (!strcmp(key, "inputs")) cfg->input_keys = config_read_values(&(cfg->n_input_keys), value, &strtok_ptr); else if (!strcmp(key, "labels")) cfg->label_keys = config_read_values(&(cfg->n_label_keys), value, &strtok_ptr); - else die("util_load_config() Error: Unknown parameter '%s' on file %s.", key, filepath); + else die("util_load_config() Error: Invalid parameter '%s' in [net] section on file %s.", key, filepath); +} + +void load_preprocess_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath) +{ + if (!strcmp(key, "onehot")) cfg->onehot_keys = config_read_values(&cfg->n_onehot_keys, value, &strtok_ptr); + else die("util_load_config() Error: Invalid parameter '%s' in [preprocess] section on file %s", key, filepath); +} + + +void load_categorical_cfgs( + struct Configs *cfg, + char *key, char *value, + char *strtok_ptr) +{ + size_t size, *value_size; + + size = cfg->n_categorical_keys; + if (cfg->n_categorical_keys == 0) { + cfg->categorical_keys = ecalloc(1, sizeof(char *)); + cfg->categorical_values = ecalloc(1, sizeof(char **)); + cfg->n_categorical_values = ecalloc(1, sizeof(size_t)); + cfg->n_categorical_keys++; + } else { + cfg->categorical_keys = erealloc(cfg->categorical_keys, sizeof(char *) * (size + 1)); + cfg->categorical_values = erealloc(cfg->categorical_values, sizeof(char *) * (size + 1)); + cfg->n_categorical_values = erealloc(cfg->n_categorical_values, sizeof(size_t) * (size + 1)); + cfg->n_categorical_keys++; + } + + value_size = cfg->n_categorical_values + size; + cfg->categorical_keys[size] = e_strdup(key); + cfg->categorical_values[size] = config_read_values(value_size, value, &strtok_ptr); + qsort(cfg->categorical_values[size], *value_size, sizeof(char *), cmpstringp); } char ** 
/*
 * Return the position of `key` inside keys[0..n_keys-1], or -1 when no
 * entry compares equal under strcmp(). `keys` is not read when n_keys is 0.
 */
int util_get_key_index(char *key, char **keys, size_t n_keys)
{
    size_t i;

    for (i = 0; i < n_keys; i++) {
        if (strcmp(key, keys[i]) == 0)
            return (int)i;
    }
    return -1;
}

/*
 * Return the index of the largest element of values[0..n_values-1].
 * On ties the lowest index wins. An empty array yields 0 and `values`
 * is not read.
 */
int util_argmax(double *values, size_t n_values)
{
    size_t i, best = 0;

    for (i = 1; i < n_values; i++) {
        /* Compare against the running maximum, not the previous element;
         * otherwise any local rise (e.g. 3 -> 4 after a 5) would win. */
        if (values[i] > values[best])
            best = i;
    }
    return (int)best;
}

/*
 * qsort() comparator for arrays of `char *`: each argument points at an
 * array element, so it is dereferenced once before calling strcmp().
 */
int cmpstringp(const void *p1, const void *p2)
{
    const char *lhs = *(const char *const *)p1;
    const char *rhs = *(const char *const *)p2;

    return strcmp(lhs, rhs);
}
#undef BUFFER_SIZE