aboutsummaryrefslogtreecommitdiff
path: root/src/util.c
diff options
context:
space:
mode:
authorjvech <jmvalenciae@unal.edu.co>2024-09-14 19:46:36 -0500
committerjvech <jmvalenciae@unal.edu.co>2024-09-14 19:46:36 -0500
commit6b2cea33c5a5f2af90eec721ff26c4ff9de468dc (patch)
treeee2e9dc8e4ef884f13a59867493cfbf03f900df5 /src/util.c
parent97ac0db6b070b11652c3c8b181ff9d2ddfe53f17 (diff)
feat: onehot feature implementation in process
To make onehot available I had to refactor how the data is stored; the current implementation supports JSON files.
Diffstat (limited to 'src/util.c')
-rw-r--r--src/util.c140
1 files changed, 138 insertions, 2 deletions
diff --git a/src/util.c b/src/util.c
index 81950f1..ad14632 100644
--- a/src/util.c
+++ b/src/util.c
@@ -25,9 +25,13 @@
#include "util.h"
#define BUFFER_SIZE 1024
+
+static int cmpstringp(const void *, const void *);
static char ** config_read_values(size_t *n_out_keys, char *first_value, char **strtok_ptr);
static void load_net_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath);
static void load_lyr_cfgs(struct Configs *cfg, char *key, char *value, char *filepath);
+static void load_categorical_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr);
+static void load_preprocess_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath);
static void add_lyr(struct Configs *cfg);
void die(const char *fmt, ...)
@@ -207,11 +211,34 @@ void util_free_config(struct Configs *ml)
free(ml->activations[i]);
free(ml->activations);
}
+
+ if (ml->onehot_keys != NULL) {
+ for (size_t i = 0; i < ml->n_onehot_keys; i++)
+ free(ml->onehot_keys[i]);
+ free(ml->onehot_keys);
+ }
+
+ if (ml->categorical_keys != NULL) {
+ for (size_t i = 0; i < ml->n_categorical_keys; i++)
+ free(ml->categorical_keys[i]);
+ free(ml->categorical_keys);
+ }
+
+ if (ml->categorical_values != NULL) {
+ for (size_t i = 0; i < ml->n_categorical_keys; i++) {
+ for (size_t j = 0; j < ml->n_categorical_values[i]; j++) {
+ free(ml->categorical_values[i][j]);
+ }
+ free(ml->categorical_values[i]);
+ }
+ free(ml->n_categorical_values);
+ free(ml->categorical_values);
+ }
}
void util_load_config(struct Configs *ml, char *filepath)
{
- enum Section {NET, LAYER, OUT_LAYER};
+ enum Section {NET, PREPROCESSING, CATEGORICAL, LAYER, OUT_LAYER};
enum Section section;
int line_number = 0;
char line_buffer[BUFFER_SIZE], line_buffer_original[BUFFER_SIZE];
@@ -234,6 +261,10 @@ void util_load_config(struct Configs *ml, char *filepath)
ml->network_size++;
add_lyr(ml);
ml->neurons[ml->network_size-1] = ml->n_label_keys;
+ } else if (!strcmp("preprocessing", token_buffer)) {
+ section = PREPROCESSING;
+ } else if (!strcmp("categorical_fields", token_buffer)) {
+ section = CATEGORICAL;
} else {
die("util_load_config() Error: Unknown section '%s' on %s",
line_buffer, filepath);
@@ -277,6 +308,12 @@ void util_load_config(struct Configs *ml, char *filepath)
case NET:
load_net_cfgs(ml, key, value, ptr_buffer, filepath);
break;
+ case PREPROCESSING:
+ load_preprocess_cfgs(ml, key, value, ptr_buffer, filepath);
+ break;
+ case CATEGORICAL:
+ load_categorical_cfgs(ml, key, value, ptr_buffer);
+ break;
case LAYER:
load_lyr_cfgs(ml, key, value, filepath);
break;
@@ -292,6 +329,48 @@ void util_load_config(struct Configs *ml, char *filepath)
break;
}
}
+
+ /* Checks categorical_keys in label_keys or input_keys or onehot_keys*/
+ size_t i,j,k;
+ for (i = 0; i < ml->n_categorical_keys; i++) {
+ int ret;
+ ret = util_get_key_index(ml->categorical_keys[i], ml->input_keys, ml->n_input_keys);
+ if (ret >= 0) continue;
+ ret = util_get_key_index(ml->categorical_keys[i], ml->label_keys, ml->n_label_keys);
+ if (ret == -1) {
+ die("util_load_config() Error: field '%s' does not exist", ml->categorical_keys[i]);
+ }
+
+ ret = util_get_key_index(ml->categorical_keys[i], ml->onehot_keys, ml->n_onehot_keys);
+ if (ret >= 0) continue;
+ die("util_load_config() Error: field '%s' must be encoded", ml->categorical_keys[i]);
+ }
+
+ /* Check onehot_keys in categorical_keys */
+ for (i = 0; i < ml->n_onehot_keys; i++) {
+ int ret = util_get_key_index(ml->onehot_keys[i],
+ ml->categorical_keys,
+ ml->n_categorical_keys);
+ if (ret >= 0) continue;
+ die("util_load_config() Error: one hot field '%s' is not defined as categorical", ml->onehot_keys[i]);
+ }
+
+ /* Determine out layer neurons */
+ size_t *out_layer_neurons = ml->neurons + ml->network_size - 1;
+ *out_layer_neurons = 0;
+ for (i = 0; i < ml->n_label_keys; i++) {
+ int ret = 1;
+
+ for (j = 0; ret && j < ml->n_categorical_keys; j++)
+ ret = strcmp(ml->categorical_keys[j], ml->label_keys[i]);
+
+ for (k = 0; ret && k < ml->n_onehot_keys; k++)
+ ret = strcmp(ml->onehot_keys[k], ml->label_keys[i]);
+
+ *out_layer_neurons += (!ret) ? ml->n_categorical_values[i] : 1;
+ }
+
+
fclose(fp);
return;
@@ -331,7 +410,40 @@ void load_net_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr
else if (!strcmp(key, "alpha")) cfg->alpha = (double)atof(value);
else if (!strcmp(key, "inputs")) cfg->input_keys = config_read_values(&(cfg->n_input_keys), value, &strtok_ptr);
else if (!strcmp(key, "labels")) cfg->label_keys = config_read_values(&(cfg->n_label_keys), value, &strtok_ptr);
- else die("util_load_config() Error: Unknown parameter '%s' on file %s.", key, filepath);
+ else die("util_load_config() Error: Invalid parameter '%s' in [net] section on file %s.", key, filepath);
+}
+
+/*
+ * Handle one key/value line read while the parser is in the
+ * "preprocessing" section.
+ *
+ * The only accepted key is "onehot": its values are parsed with
+ * config_read_values(), the resulting array is stored in cfg->onehot_keys
+ * and the element count is written to cfg->n_onehot_keys.
+ * Any other key is reported through die() together with `filepath`.
+ */
+void load_preprocess_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath)
+{
+    if (strcmp(key, "onehot") == 0) {
+        cfg->onehot_keys = config_read_values(&cfg->n_onehot_keys, value, &strtok_ptr);
+        return;
+    }
+
+    /* Report the section under the name the parser matches: "preprocessing". */
+    die("util_load_config() Error: Invalid parameter '%s' in [preprocessing] section on file %s", key, filepath);
+}
+
+
+/*
+ * Handle one key/value line read while the parser is in the
+ * "categorical_fields" section.
+ *
+ * Each call appends one entry to three parallel arrays in `cfg`:
+ *   - categorical_keys[idx]      copy of `key` (e_strdup)
+ *   - categorical_values[idx]    array returned by config_read_values()
+ *   - n_categorical_values[idx]  count written by config_read_values()
+ * The value array is then sorted with cmpstringp (strcmp order).
+ * cfg->n_categorical_keys is incremented by one.
+ */
+void load_categorical_cfgs(
+        struct Configs *cfg,
+        char *key, char *value,
+        char *strtok_ptr)
+{
+    size_t idx = cfg->n_categorical_keys;
+    size_t *n_values;
+
+    /*
+     * Size every array from its own element type (sizeof *ptr) so the
+     * allocation cannot drift from the declared type; categorical_values
+     * holds `char **` elements, not `char *`.
+     */
+    if (idx == 0) {
+        cfg->categorical_keys = ecalloc(1, sizeof *cfg->categorical_keys);
+        cfg->categorical_values = ecalloc(1, sizeof *cfg->categorical_values);
+        cfg->n_categorical_values = ecalloc(1, sizeof *cfg->n_categorical_values);
+    } else {
+        cfg->categorical_keys = erealloc(cfg->categorical_keys,
+                sizeof *cfg->categorical_keys * (idx + 1));
+        cfg->categorical_values = erealloc(cfg->categorical_values,
+                sizeof *cfg->categorical_values * (idx + 1));
+        cfg->n_categorical_values = erealloc(cfg->n_categorical_values,
+                sizeof *cfg->n_categorical_values * (idx + 1));
+    }
+    cfg->n_categorical_keys = idx + 1;
+
+    n_values = cfg->n_categorical_values + idx;
+    cfg->categorical_keys[idx] = e_strdup(key);
+    cfg->categorical_values[idx] = config_read_values(n_values, value, &strtok_ptr);
+    qsort(cfg->categorical_values[idx], *n_values, sizeof(char *), cmpstringp);
+}
char ** config_read_values(size_t *n_out_keys, char *first_value, char **strtok_ptr)
@@ -348,4 +460,28 @@ char ** config_read_values(size_t *n_out_keys, char *first_value, char **strtok_
}
return out_keys;
}
+
+/*
+ * Linear search for `key` in the first `n_keys` entries of `keys`.
+ *
+ * Returns the index of the first entry that compares equal with strcmp(),
+ * or -1 when no entry matches (including when n_keys is 0).
+ */
+int util_get_key_index(char *key, char **keys, size_t n_keys)
+{
+    size_t pos = 0;
+
+    while (pos < n_keys) {
+        if (strcmp(key, keys[pos]) == 0)
+            return (int)pos;
+        pos++;
+    }
+
+    return -1;
+}
+
+/*
+ * Return the index of the largest element among the first `n_values`
+ * entries of `values`.
+ *
+ * On ties the lowest index wins, because only a strictly greater value
+ * replaces the current best.  When n_values is 0 nothing is read from
+ * `values` and 0 is returned.
+ */
+int util_argmax(double *values, size_t n_values)
+{
+    size_t best = 0;
+    size_t i;
+
+    /* Start at 1: element 0 is the initial best (if it exists at all). */
+    for (i = 1; i < n_values; i++) {
+        /* Compare against the running maximum, not the previous element. */
+        if (values[i] > values[best])
+            best = i;
+    }
+
+    return (int)best;
+}
+
+/*
+ * qsort() comparator for arrays of C-string pointers.
+ *
+ * p1 and p2 point at array elements (each a `char *`); the strings they
+ * refer to are ordered with strcmp().
+ */
+int cmpstringp(const void *p1, const void *p2)
+{
+    const char *lhs = *(const char **) p1;
+    const char *rhs = *(const char **) p2;
+
+    return strcmp(lhs, rhs);
+}
#undef BUFFER_SIZE
Feel free to download, copy and edit any repo