* probability of the transposition from C to I. It's simply much
* cleaner and more direct to separate the two factors.
*
+ * Research tells us that 80% to 95% of all spelling errors have an edit
+ * distance no greater than one. Knowing this, we can optimize for the
+ * most common mistakes without taking a performance hit, and build the
+ * longer edit distances on top of that instead of checking everything,
+ * as I had originally conceived. (That staging is sketched in code just
+ * after this comment.)
+ *
* A little information on additional algorithms used:
- *
+ *
* Initially when I implemented this corrector, it was very slow.
* Need I remind you this is essentially a brute-force attack on strings,
* and since every transformation requires dynamic memory allocations,
* very slow indeed. To combat this I suggested burst tries to Blub,
* and the next day he had implemented them. Sure enough this brought
* the runtime down by a factor > 100%.
+ *
+ * Future work (if we really need it):
+ *
+ * Currently we can only distinguish one source of error in the
+ * language model we use. This could become an issue for identifiers
+ * that have close colliding rates, e.g. colate->coat yields collate.
+ *
+ * Currently the error model is fairly trivial: the smaller the edit
+ * distance, the smaller the error. This usually causes some unexpected
+ * problems, e.g. reciet->recite yields recipt. For QuakeC this could
+ * become a problem when lots of identifiers are involved.
+ *
+ * Our control mechanism could use a limit, i.e. a cap on the number of
+ * sets of edits generated for distance X. This would also increase
+ * execution speed considerably. (The sketch below includes one possible
+ * shape for such a limit.)
*/
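+
+/*
+ * To make that staging concrete in isolation: the toy code below is a
+ * hypothetical, self-contained sketch (the toy_* names are not part of
+ * this corrector). It looks for a known word among distance-one edits
+ * first; only when that fails would a real pass expand each candidate
+ * again to reach distance two. The `limit` parameter is one possible
+ * shape of the edit-set cap mentioned under future work above.
+ */
+#include <string.h>
+
+static const char *toy_dict[] = { "monster", "vector", "entity" };
+
+static int toy_known(const char *word) {
+    size_t i;
+    for (i = 0; i < sizeof(toy_dict)/sizeof(*toy_dict); i++)
+        if (!strcmp(toy_dict[i], word))
+            return 1;
+    return 0;
+}
+
+/*
+ * Try single-character deletions of `word` (one kind of distance-one
+ * edit), at most `limit` of them; on a hit, fill `out` and return 1.
+ */
+static int toy_correct1(const char *word, char *out, size_t limit) {
+    const size_t len = strlen(word);
+    size_t       del;
+
+    for (del = 0; del < len && del < limit; del++) {
+        memcpy(out, word, del);                       /* prefix before del */
+        memcpy(out + del, word + del + 1, len - del); /* rest, incl. '\0'  */
+        if (toy_known(out))
+            return 1;
+    }
+    return 0; /* a caller would only now pay for distance two */
+}
+
+/* e.g. toy_correct1("monsster", buf, (size_t)-1) finds "monster" in the
+ * distance-one pass alone. */
+
+/*
+ * Insert `key` into the dictionary trie below, creating one node per
+ * character as needed; `value` ends up on the node for the final
+ * character.
+ */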
void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
const unsigned char *data = (const unsigned char*)key;
while (*data) {
- unsigned char ch = *data;
- correct_trie_t *entries = t->entries;
- const size_t vs = vec_size(t->entries);
- size_t i;
+ const size_t vs = vec_size(t->entries);
+ unsigned char ch = *data;
+ correct_trie_t *entries = t->entries;
+ size_t i;
+
        /* walk into the child node matching this character, if any */
        for (i = 0; i < vs; ++i) {
            if (entries[i].ch == ch) {
                t = &entries[i];
                break;
            }
        }
        /* no child matched: grow the entry vector with a fresh node */
        if (i == vs) {
            correct_trie_t *elem = (correct_trie_t*)vec_add(t->entries, 1);

            elem->ch      = ch;
            elem->value   = NULL;
            elem->entries = NULL;
            t             = elem;
        }
        ++data;
    }
    /* the node for the last character carries the key's value */
    t->value = value;
}
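+
+/*
+ * A hypothetical usage sketch (not called anywhere): it assumes a zeroed
+ * node is a valid empty trie, which matches how fresh children are set
+ * up in correct_trie_set itself.
+ */
+static void correct_trie_usage(void) {
+    static size_t  freq = 1;  /* value to record for the word */
+    correct_trie_t root;
+
+    root.ch      = 0;
+    root.value   = NULL;
+    root.entries = NULL;
+
+    correct_trie_set(&root, "monster", &freq);
+}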
void correct_del(correct_trie_t* dictonary, size_t **data) {
- size_t i;
+ size_t i;
const size_t vs = vec_size(data);
+
for (i = 0; i < vs; i++)
mem_d(data[i]);
* because they're only valid after the first character, which must be
* an _ or an alpha character.
*/
-static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
+static const char correct_alpha[] = "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "_"; /* TODO: Numbers ... */
/*
* correcting logic for the following forms of transformations:
}
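+/*
+ * Collect every edit, of every string in `array`, that exists in the
+ * dictionary trie, skipping duplicates.
+ */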
static char **correct_known(correct_trie_t* table, char **array, size_t rows, size_t *next) {
- size_t itr;
- size_t jtr;
- size_t len;
- size_t row;
+ size_t itr = 0;
+ size_t jtr = 0;
+ size_t len = 0;
+ size_t row = 0;
size_t nxt = 8;
char **res = correct_pool_alloc(sizeof(char *) * nxt);
char **end = NULL;
- for (itr = 0, len = 0; itr < rows; itr++) {
+ for (; itr < rows; itr++) {
end = correct_edit(array[itr]);
row = correct_size(array[itr]);
            for (jtr = 0; jtr < row; jtr++) {
if (correct_find(table, end[jtr]) && !correct_exist(res, len, end[jtr])) {
res = correct_known_resize(res, &nxt, len+1);
res[len++] = end[jtr];
}
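+/*
+ * Of the candidate strings in `array`, pick the one with the highest
+ * value (frequency) recorded in the dictionary trie.
+ */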
static char *correct_maximum(correct_trie_t* table, char **array, size_t rows) {
- char *str = NULL;
- size_t *itm = NULL;
- size_t itr;
- size_t top;
+ char *str = NULL;
+ size_t *itm = NULL;
+ size_t itr = 0;
+ size_t top = 0;
- for (itr = 0, top = 0; itr < rows; itr++) {
+ for (; itr < rows; itr++) {
if ((itm = correct_find(table, array[itr])) && (*itm > top)) {
top = *itm;
str = array[itr];
*
* the add function works the same, except the identifier is used to
* add to the dictionary.
- */
-
+ */
char *correct_str(correct_trie_t* table, const char *ident) {
char **e1;
char **e2;