/** * ml - a neural network processor written with C * Copyright (C) 2023 jvech * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include "util.h" #define BUFFER_SIZE 1024 static int cmpstringp(const void *, const void *); static char ** config_read_values(size_t *n_out_keys, char *first_value, char **strtok_ptr); static void load_net_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath); static void load_lyr_cfgs(struct Configs *cfg, char *key, char *value, char *filepath); static void load_categorical_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr); static void load_preprocess_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath); static void add_lyr(struct Configs *cfg); void die(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); if (fmt[0] && fmt[strlen(fmt)-1] == ':') { fputc(' ', stderr); perror(NULL); } else { fputc('\n', stderr); } exit(1); } void * erealloc(void *ptr, size_t size) { void *p; if (!(p = realloc(ptr, size))) die("realloc:"); return p; } void * ecalloc(size_t nmemb, size_t size) { void *p; if (!(p = calloc(nmemb, size))) die("calloc:"); return p; } char *e_strdup(const char *s) { char *out = strdup(s); if (out == NULL) die("strdup() Error:"); return out; } void version() { printf("ml 0.5.0\n"); printf( "Copyright (C) 2023 jvech\n\n" "This program is free software: you can redistribute it and/or modify\n" "it under the terms of the GNU General Public License as published by\n" "the Free Software Foundation, either version 3 of the License, or\n" "(at your option) any later version.\n\n" ); printf("Written by jvech\n"); exit(0); } void usage(int exit_code) { FILE *fp = (!exit_code) ? stdout : stderr; fprintf(fp, "Usage: ml [re]train [Options] FILE\n" " or: ml predict [-Ohv] [-f FORMAT] [-o FILE] [-p INT] FILE\n" "\n" "Options:\n" " -h, --help Show this message\n" " -f, --format=FORMAT Define input or output FILE format if needed\n" " -O, --only-out Don't show input fields (only works with predict)\n" " -a, --alpha=ALPHA Learning rate (only works with train)\n" " -b, --batch=INT Select batch size [default: 32] (only works with train)\n" " -c, --config=FILE Configuration filepath [default=~/.config/ml/ml.cfg]\n" " -e, --epochs=EPOCHS Epochs to train the model (only works with train)\n" " -o, --output=FILE Output file (only works with predict)\n" " -p, --precision=INT Decimals output precision (only works with predict)\n" " [default=auto]\n" " -S, --no-shuffle Don't shuffle data each epoch (only works with train)\n" "\n" ); exit(exit_code); } void util_load_cli(struct Configs *ml, int argc, char *argv[]) { if (argc <= 1) usage(1); static struct option long_opts[] = { {"help", no_argument, 0, 'h'}, {"version", no_argument, 0, 'v'}, {"format", required_argument, 0, 'f'}, {"epochs", required_argument, 0, 'e'}, {"batch", required_argument, 0, 'b'}, {"alpha", required_argument, 0, 'a'}, {"no-shuffle", no_argument, 0, 'S'}, {"output", required_argument, 0, 'o'}, {"config", required_argument, 0, 'c'}, {"only-out", no_argument, 0, 'O'}, {"precision", required_argument, 0, 'p'}, {0, 0, 0, 0 }, }; int c; while (1) { c = getopt_long(argc, argv, "hvOSc:e:a:o:i:f:p:b:", long_opts, NULL); if (c == -1) { break; } switch (c) { case 'e': ml->epochs = (size_t)atol(optarg); break; case 'a': ml->alpha = (double)atof(optarg); break; case 'o': ml->out_filepath = optarg; break; case 'c': ml->config_filepath = optarg; break; case 'f': ml->file_format = optarg; break; case 'O': ml->only_out = true; break; case 'p': ml->decimal_precision = (!strcmp("auto", optarg))? -1: (int)atoi(optarg); break; case 'b': if (atoi(optarg) <= 0) die("util_load_cli() Error: batch size must be greater than 0"); ml->batch_size = (size_t)atol(optarg); break; case 'S': ml->shuffle = false; break; case 'h': usage(0); break; case 'v': version(); break; default: usage(1); break; } } argv += optind; argc -= optind; if (argc != 2) usage(1); ml->in_filepath = argv[1]; } void util_free_config(struct Configs *ml) { if (ml->loss != NULL) free(ml->loss); if (ml->neurons != NULL) free(ml->neurons); if (ml->weights_filepath != NULL) free(ml->weights_filepath); if (ml->input_keys != NULL) { for (size_t i = 0; i < ml->n_input_keys; i++) free(ml->input_keys[i]); free(ml->input_keys); } if (ml->label_keys != NULL) { for (size_t i = 0; i < ml->n_label_keys; i++) free(ml->label_keys[i]); free(ml->label_keys); } if (ml->activations != NULL) { for (size_t i = 0; i < ml->network_size; i++) free(ml->activations[i]); free(ml->activations); } if (ml->onehot_keys != NULL) { for (size_t i = 0; i < ml->n_onehot_keys; i++) free(ml->onehot_keys[i]); free(ml->onehot_keys); } if (ml->categorical_keys != NULL) { for (size_t i = 0; i < ml->n_categorical_keys; i++) free(ml->categorical_keys[i]); free(ml->categorical_keys); } if (ml->categorical_values != NULL) { for (size_t i = 0; i < ml->n_categorical_keys; i++) { for (size_t j = 0; j < ml->n_categorical_values[i]; j++) { free(ml->categorical_values[i][j]); } free(ml->categorical_values[i]); } free(ml->n_categorical_values); free(ml->categorical_values); } } void util_load_config(struct Configs *ml, char *filepath) { enum Section {NET, PREPROCESSING, CATEGORICAL, LAYER, OUT_LAYER}; enum Section section; int line_number = 0; char line_buffer[BUFFER_SIZE], line_buffer_original[BUFFER_SIZE]; char token_buffer[BUFFER_SIZE]; FILE *fp = fopen(filepath, "r"); if (fp == NULL) return; while (fgets(line_buffer, BUFFER_SIZE, fp)) { int ret = sscanf(line_buffer, "[%[-_a-zA-Z0-9]]", token_buffer); line_number++; if (ret >= 1){ if (!strcmp("net", token_buffer)) { section = NET; } else if (!strcmp("layer", token_buffer)) { section = LAYER; ml->network_size++; add_lyr(ml); } else if (!strcmp("outlayer", token_buffer)) { section = OUT_LAYER; ml->network_size++; add_lyr(ml); ml->neurons[ml->network_size-1] = ml->n_label_keys; } else if (!strcmp("preprocessing", token_buffer)) { section = PREPROCESSING; } else if (!strcmp("categorical_fields", token_buffer)) { section = CATEGORICAL; } else { die("util_load_config() Error: Unknown section '%s' on %s", line_buffer, filepath); } continue; } sscanf(line_buffer, "%1023[^\n]", line_buffer_original); char *line_ptr = line_buffer; while (*line_ptr == ' ') line_ptr++; // delete whitespaces /* if the line start with comments or is a blank line ignore it */ if (*line_ptr == ';' || *line_ptr == '#' || *line_ptr == '\n') continue; /* Verify that each line starts with [a-zA-Z] */ if ((*line_ptr < 0x41 && *line_ptr > 0x5A) || (*line_ptr < 0x61 && *line_ptr > 0x7A)) goto util_load_config_error; char *ptr_buffer; strtok_r(line_buffer, ";#", &ptr_buffer); // omit comments /* Check For invalid = characters*/ int eq_count; for (eq_count = 0, line_ptr = line_buffer; *line_ptr != '\0'; line_ptr++, eq_count += (*line_ptr == '=')); if (eq_count > 1) goto util_load_config_error; /* Load Key Value */ char *key, *value; key = strtok_r(line_buffer, " =", &ptr_buffer); value = strtok_r(NULL, "= ,\n", &ptr_buffer); if (value == NULL) goto util_load_config_error; switch (section) { case NET: load_net_cfgs(ml, key, value, ptr_buffer, filepath); break; case PREPROCESSING: load_preprocess_cfgs(ml, key, value, ptr_buffer, filepath); break; case CATEGORICAL: load_categorical_cfgs(ml, key, value, ptr_buffer); break; case LAYER: load_lyr_cfgs(ml, key, value, filepath); break; case OUT_LAYER: load_lyr_cfgs(ml, key, value, filepath); if (!strcmp("neurons", key) && (size_t)atol(value) != ml->n_label_keys) { die("util_load_config() Error: out layer neurons (%zu) differ from the number of labels (%zu)", (size_t)atol(value), ml->n_label_keys); } break; default: goto util_load_config_error; break; } } /* Checks categorical_keys in label_keys or input_keys or onehot_keys*/ size_t i,j,k; for (i = 0; i < ml->n_categorical_keys; i++) { int ret; ret = util_get_key_index(ml->categorical_keys[i], ml->input_keys, ml->n_input_keys); if (ret >= 0) continue; ret = util_get_key_index(ml->categorical_keys[i], ml->label_keys, ml->n_label_keys); if (ret == -1) { die("util_load_config() Error: field '%s' does not exist", ml->categorical_keys[i]); } ret = util_get_key_index(ml->categorical_keys[i], ml->onehot_keys, ml->n_onehot_keys); if (ret >= 0) continue; die("util_load_config() Error: field '%s' must be encoded", ml->categorical_keys[i]); } /* Check onehot_keys in categorical_keys */ for (i = 0; i < ml->n_onehot_keys; i++) { int ret = util_get_key_index(ml->onehot_keys[i], ml->categorical_keys, ml->n_categorical_keys); if (ret >= 0) continue; die("util_load_config() Error: one hot field '%s' is not defined as categorical", ml->onehot_keys[i]); } /* Determine out layer neurons */ size_t *out_layer_neurons = ml->neurons + ml->network_size - 1; *out_layer_neurons = 0; for (i = 0; i < ml->n_label_keys; i++) { int ret = 1; for (j = 0; ret && j < ml->n_categorical_keys; j++) ret = strcmp(ml->categorical_keys[j], ml->label_keys[i]); for (k = 0; ret && k < ml->n_onehot_keys; k++) ret = strcmp(ml->onehot_keys[k], ml->label_keys[i]); *out_layer_neurons += (!ret) ? ml->n_categorical_values[i] : 1; } fclose(fp); return; util_load_config_error: die("util_load_config() Error: Invalid format on %s.\n %d: %s", filepath, line_number, line_buffer_original); } void add_lyr(struct Configs *cfg) { if (cfg->network_size == 1) { cfg->activations = ecalloc(1, sizeof(char *)); cfg->neurons = ecalloc(1, sizeof(size_t)); return; } cfg->activations = erealloc(cfg->activations, cfg->network_size * sizeof(char *)); cfg->neurons = erealloc(cfg->neurons, cfg->network_size * sizeof(size_t)); } void load_lyr_cfgs(struct Configs *cfg, char *key, char *value, char *filepath) { size_t index = cfg->network_size - 1; if (index > cfg->network_size) die("load_lyr_cfgs() Error: index '%d' is greater than network_size '%d'", index, cfg->network_size); if (!strcmp(key, "activation")) cfg->activations[index] = strdup(value); else if (!strcmp(key, "neurons")) cfg->neurons[index] = atof(value); else die("util_load_config() Error: Unknown parameter '%s' on file %s.", key, filepath); } void load_net_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath) { if (!strcmp(key, "weights_path")) cfg->weights_filepath = e_strdup(value); else if (!strcmp(key, "loss")) cfg->loss = e_strdup(value); else if (!strcmp(key, "epochs")) cfg->epochs = (size_t)atol(value); else if (!strcmp(key, "batch")) cfg->batch_size = (size_t)atol(value); else if (!strcmp(key, "alpha")) cfg->alpha = (double)atof(value); else if (!strcmp(key, "inputs")) cfg->input_keys = config_read_values(&(cfg->n_input_keys), value, &strtok_ptr); else if (!strcmp(key, "labels")) cfg->label_keys = config_read_values(&(cfg->n_label_keys), value, &strtok_ptr); else die("util_load_config() Error: Invalid parameter '%s' in [net] section on file %s.", key, filepath); } void load_preprocess_cfgs(struct Configs *cfg, char *key, char *value, char *strtok_ptr, char *filepath) { if (!strcmp(key, "onehot")) cfg->onehot_keys = config_read_values(&cfg->n_onehot_keys, value, &strtok_ptr); else die("util_load_config() Error: Invalid parameter '%s' in [preprocess] section on file %s", key, filepath); } void load_categorical_cfgs( struct Configs *cfg, char *key, char *value, char *strtok_ptr) { size_t size, *value_size; size = cfg->n_categorical_keys; if (cfg->n_categorical_keys == 0) { cfg->categorical_keys = ecalloc(1, sizeof(char *)); cfg->categorical_values = ecalloc(1, sizeof(char **)); cfg->n_categorical_values = ecalloc(1, sizeof(size_t)); cfg->n_categorical_keys++; } else { cfg->categorical_keys = erealloc(cfg->categorical_keys, sizeof(char *) * (size + 1)); cfg->categorical_values = erealloc(cfg->categorical_values, sizeof(char *) * (size + 1)); cfg->n_categorical_values = erealloc(cfg->n_categorical_values, sizeof(size_t) * (size + 1)); cfg->n_categorical_keys++; } value_size = cfg->n_categorical_values + size; cfg->categorical_keys[size] = e_strdup(key); cfg->categorical_values[size] = config_read_values(value_size, value, &strtok_ptr); qsort(cfg->categorical_values[size], *value_size, sizeof(char *), cmpstringp); } char ** config_read_values(size_t *n_out_keys, char *first_value, char **strtok_ptr) { *n_out_keys = 1; char **out_keys = ecalloc(1, sizeof(char *)); out_keys[0] = e_strdup(first_value); char *value; while ((value = strtok_r(NULL, ", \n", strtok_ptr)) != NULL) { out_keys = erealloc(out_keys, sizeof(char *) * (*n_out_keys + 1)); out_keys[*n_out_keys] = e_strdup(value); (*n_out_keys)++; } return out_keys; } int util_get_key_index(char *key, char **keys, size_t n_keys) { int i; for (i = 0; (size_t)i < n_keys; i++) if (!strcmp(key, keys[i])) return i; return -1; } int util_argmax(double *values, size_t n_values) { double value = values[0]; size_t i, j; for (i = j = 0; i < n_values; i++) { if (values[i] > value) j = i; value = values[i]; } return j; } int cmpstringp(const void *p1, const void *p2) { return strcmp(*(const char **) p1, *(const char **) p2); } #undef BUFFER_SIZE