代码之家  ›  专栏  ›  技术社区  ›  Navaneeth K N

标记字符串的有效方法-C

  •  6
  • Navaneeth K N  · 技术社区  · 14 年前

    特里亚 . 每个标记都知道它有子标记。一个简单的tokens表看起来像,

    pattern    value         has_children
    --------   ------        --------
    s          s-val         1
    stack      stack-val     0
    over       over-val      1
    overflow   overflow-val  0
    

    stack 是的孩子 s overflow over . 实际上,此表将以这种方式订购5000多条记录。

    现在,给一个字符串 stackover stack-valover-val . 算法是贪婪的,它总是试图找到最长的匹配。

    为此,我将开始从输入中读取每个字符,查找匹配项,如果找到匹配项并且令牌有子项,则通过包含下一个字符来再次查找匹配项。直到我们找到最长的匹配。如果找不到匹配项,请尝试通过包含下一个字符进行匹配,直到到达字符串末尾或匹配成功为止。

    如果到达字符串末尾时没有匹配项,则输出 ? 符号并从输入中删除第一个字符。用剩下的字符重复整个过程。

    这个算法可以工作,但是回溯和迭代所有可能的输入组合会使它变得缓慢和复杂。

    我想知道有没有更好的方法来解决这个问题?任何帮助都将不胜感激。

    5 回复  |  直到 14 年前
        1
  •  2
  •   Dialecticus    14 年前

    不需要回溯,您可以将所有可能的结果保存在内存中,直到一个结果在输入流的某个点上被单挑出来。例子

    标记:S STACK STACK OVERFLOW STAG OVER OVERFLOW

    1-在位置0上找到S,具有以S开头的标记,请全部尝试,只有S有效,因此 解决方案

    3-T在2上,没有这样的标记,所以S现在可以解析,但是我们也有更长的标记(堆栈),所以S不好。沟渠和烟囱只剩下了,但它有孩子。给孩子们试试绳子。不可能有孩子 解析堆栈
    解决

    最终结果:S叠加超过fun

        2
  •  2
  •   GWW    14 年前

    你能用一下 Aho-Corasick 算法?它创建了一个搜索关键字树(trie)的自动机。

        3
  •  2
  •   mjhm    14 年前

    0 stack      1
    1 s          0
    2 overflow   3
    3 over       5
    4 ovum       5
    5 o          0
    6 exchange   7
    7 ex         0
    

    此列表的第三列是指向父令牌的指针,父令牌始终位于列表的下方。然后,您可以将目标字符串和二进制搜索放在列表中合适的位置。如果它落在匹配的标记上,那么您将剪掉该部分并对其余部分重复该过程。如果不匹配,则使用父指针查找下一个最长的潜在匹配标记。

    如果你真的想变得更漂亮,你也可以把字符串分成64位的单词,在二进制搜索中一次比较8个字符。

        4
  •  1
  •   Hasturkun    14 年前

    我建议你试试 Ragel Ragel user guide 更多信息。

    我已经创建了一个很小的测试,我认为它符合您的规范,这只是状态机描述,没有输入的代码:

    %%{
    machine test;
    
    main := |*
    's' => { puts("s-val");};
    'stack' => { puts("stack-val");};
    'over' => { puts("over-val");};
    'overflow' => { puts("overflow-val");};
    
    # Anything else matches to any, outputs a '?' and continues
    any => {putc('?');};
    *|;
    }%%
    
        5
  •  0
  •   Neopallium    14 年前

    The following token_tree code is based on the prefix_tree class from ZeroMQ

    prefix_tree类只在树的一个前缀与输入文本的开头匹配时返回“true”。它甚至不会告诉你哪个前缀或者这个前缀有多长。

    函数token_tree_longest_token()只需要返回匹配的最长令牌的长度

    基本算法与问题中描述的算法类似,但实现起来可能更快。

    还有一些方法可以提高内存使用率,这样可以更快地使用内存。

    #include <stdint.h>
    #include <stdlib.h>
    
    /* #define TEST_TOKEN_TREE */
    /*
     * TODO: possible improvements, use multiple types of nodes: string/branch/leaf.
     * The string node would replace a chain of normal token_nodes and save memory.
     * This would require spliting a node to add branch points.
     * Use these structs:
     * struct token_node {
     *   uint32_t ref_count;
     *   uint8_t  node_type; -- node is token_node_str/token_node_branch/token_node_leaf
     * };
     * struct token_node_str {
     *   token_node base;
     *   uint8_t    reserved;
     *   uint16_t   len;          -- string length
     *   token_node *child;       -- string nodes can only have one child.
     *   uint8_t    str[0];       -- embedded string (not null-terminated)
     * };
     * struct token_node_branch {
     *   token_node base;
     *   uint8_t  min;            -- smallest char in child list.
     *   uint16_t count;          -- child count.
     *   token_node *children[0];
     * };
     * struct token_node_leaf { -- leaf nodes have no children.
     *   token_node base;
     * };
     * This will save memory, but will make code much more complex.
     */
    
    typedef struct token_tree token_tree;
    typedef struct token_node token_node;
    
    struct token_tree {
        token_node *root; /**< root node of token tree. */
    };
    
    struct token_node {
        uint32_t   ref_count;    /**< how many token references end at this node. */
        uint8_t    min;          /**< smallest 'char' in children's list. */
        uint8_t    reserved;     /**< padding. */
        uint16_t   count;        /**< number of children. (max count = 256, so count must be 16bits) */
        token_node *children[0]; /**< list of children nodes.  index by (c - min) */
    };
    
    #define NODE_SIZE(count) (sizeof(token_node) + (sizeof(token_node *) * count))
    
    static token_node *token_node_new(uint16_t count) {
        token_node *node = calloc(1, NODE_SIZE(count));
        node->count = count;
        return node;
    }
    
    static void token_node_build_chain(token_node **pnode, const uint8_t *token, size_t len) {
        token_node *node;
        do {
            /* the last node in the chain will have no children. */
            node = token_node_new((len == 0) ? 0 : 1);
            *pnode = node; /* add node to slot in parent's children list. */
            if(len == 0) break;
            /* new node will have one child. */
            node->min = *token;
            node->count = 1;
            /* slot where next node will be saved. */
            pnode = &(node->children[0]);
            /* consume char. */
            token++;
            len--;
        } while(1);
        /* mark last node as end of a valid token. */
        node->ref_count++;
    }
    
    static void token_node_free(token_node *node) {
        uint32_t i;
        uint32_t count = node->count;
    
        /* free children nodes. */
        for(i=0; i < count; i++) {
            if(node->children[i]) token_node_free(node->children[i]);
        }
        free(node);
    }
    
    static void token_node_grow(token_node **pnode, uint8_t c) {
        token_node *node = *pnode;
        token_node **children;
        uint8_t  old_min = node->min;
        uint16_t old_count = node->count;
        uint32_t i;
        uint8_t  min;
        uint16_t count;
    
        if(c < old_min) {
            min = c;
            count = old_count + (old_min - min);
        } else {
            if(old_count == 0) {
                /* the list was empty, so this is the first char. */
                old_min = c;
            }
            min = old_min;
            c -= old_min;
            if(c < old_count) {
                /* don't need to grow. */
                return;
            }
            count = c + 1;
        }
    
        node = realloc(node, NODE_SIZE(count));
        *pnode = node;
        children = node->children;
        /* if the 'min' value changed, then we need to move all the old slots up. */
        if(old_min != min) {
            uint32_t diff = old_min - min;
            for(i=count-1; i >= diff; i--) {
                children[i] = children[i - diff];
            }
            /* null new slots at start of children list. */
            for(i=0; i < diff; i++) {
                children[i] = NULL;
            }
        } else {
            /* null new slots at end of children list. */
            for(i=old_count; i < count; i++) {
                children[i] = NULL;
            }
        }
        node->min = min;
        node->count = count;
    }
    
    static token_node **token_node_find_last_node(token_node **pnode, const uint8_t **ptoken, size_t *plen) {
        const uint8_t *token = *ptoken;
        size_t len = *plen;
        uint32_t c;
        token_node *node = *pnode;
    
        while(node && len) {
            /* next char. */
            c = (*token);
            /* if c < node->min, then it will underflow and be > node->count. */
            c -= node->min;
            /* make sure c is in range. */
            if(c >= node->count) {
                /*
                 * NOTE: we don't consume this char and "*pnode" will not be null.
                 * When adding tokens, this node will be grown to hold more children.
                 */
                break;
            }
    
            /* consume char. */
            token++;
            len--;
            /* get pointer to next node's slot. */
            pnode = &(node->children[c]);
            node = *pnode;
        }
    
        *ptoken = token;
        *plen = len;
        /* return pointer to last node's slot. */
        return pnode;
    }
    
    static void token_node_add(token_node **pnode, const uint8_t *token, size_t len) {
        token_node *node;
    
        /* find last node in chain for this token. */
        pnode = token_node_find_last_node(pnode, &token, &len);
    
        /* if full token was consumed then we found the last node for this token. */
        if(!len) {
            node = *pnode;
            node->ref_count++;
            return;
        }
    
        /* check if the children list of the last node needs to be grown. */
        node = *pnode;
        if(node) {
            uint32_t c = *token;
            /* consume char. */
            token++;
            len--;
            /* grow node to make room for new char. */
            token_node_grow(pnode, c);
            node = *pnode; /* token_node_grow() may change the node's pointer. */
            /* get slot for new child. */
            pnode = &(node->children[c - node->min]);
        }
        /* build node chain for un-consumed part of token. */
        token_node_build_chain(pnode, token, len);
    }
    
    static size_t token_node_longest_token(token_node *node, const uint8_t *text, size_t len) {
        size_t last_token_len = 0;
        size_t off = 0;
        uint32_t c;
    
        /* loop until we get a NULL node or run out of text. */
        do {
            if(node->ref_count > 0) {
                /* found a token, keep track of it's length. */
                last_token_len = off;
            }
            /* end of input text. */
            if(off >= len) break;
            /* next char. */
            c = text[off];
            /* if c < node->min, then it will underflow and be > node->count. */
            c -= node->min;
            /* make sure c is in range. */
            if(c >= node->count) {
                /* End of search, no more child nodes. */
                break;
            }
    
            /* consume char. */
            off++;
            /* get pointer to next node's slot. */
            node = node->children[c];
        } while(node);
    
        /* return length of largest token found. */
        return last_token_len;
    }
    
    extern token_tree *token_tree_new() {
        token_tree *tree = malloc(sizeof(token_tree));
        tree->root = token_node_new(0);
        return tree;
    }
    
    extern void token_tree_free(token_tree *tree) {
        token_node_free(tree->root);
        free(tree);
    }
    
    extern void token_tree_add(token_tree *tree, const char *token, size_t len) {
        token_node_add(&(tree->root), token, len);
    }
    
    extern size_t token_tree_longest_token(token_tree *tree, const char *text, size_t len) {
        return token_node_longest_token(tree->root, text, len);
    }
    
    #ifdef TEST_TOKEN_TREE
    #include <stdio.h>
    #include <string.h>
    
    static const char *test_tokens[] = {
        "s",
        "stack",
        "stackoverflow",
        "over",
        "overflow",
        NULL,
    };
    
    static const char *test_input[] = {
        "aastackoverasdfasdf",
        "stack7777",
        "777stack777",
        "overstackflow",
        NULL,
    };
    
    static void add_tokens(token_tree *tree, const char **tokens) {
        int i;
        for(i = 0; tokens[i] != NULL; i++) {
            token_tree_add(tree, tokens[i], strlen(tokens[i]));
        }
    }
    
    static void print_tokens(token_tree *tree, const char *text) {
        size_t len = strlen(text);
        size_t token_len;
    
        printf("input: \"%s\"\n", text);
        printf("tokens: [");
        while(len) {
            token_len = token_tree_longest_token(tree, text, len);
            if(token_len > 0) {
                printf("<%.*s>", (int)token_len, text);
            } else {
                printf("?");
                token_len = 1;
            }
            text += token_len;
            len -= token_len;
        }
        printf("]\n");
    }
    
    static void run_test(token_tree *tree, const char **texts) {
        int i;
        for(i = 0; texts[i] != NULL; i++) {
            print_tokens(tree, texts[i]);
        }
    }
    
    int main(int argc, char *argv[]) {
        token_tree *tree = token_tree_new();
    
        add_tokens(tree, test_tokens);
    
        run_test(tree, test_input);
        run_test(tree, test_tokens);
    
        token_tree_free(tree);
    }
    
    #endif