class WordScoop

Constants

VERSION

Attributes

Public Class Methods

new(*args) click to toggle source

// word_scoop.c // // Tsukasa OISHI // // 2009/11/21 //************************************

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ruby.h>
#include <ruby/encoding.h>
#include "word_scoop.h"

static VALUE t_add(VALUE, VALUE);

// initialize node static node initialize_node(char moji) {

node work = (node)malloc(sizeof(struct _node));
if (!work) {
    // except
    rb_raise(rb_eStandardError, "memory is not enough");
}

work->moji = moji;
work->end_flag = false;
work->child_head = NULL;
work->next = NULL;

return work;

}

// add child node static void add_child(node parent, node child) {

if (parent->child_head) {
    child->next = parent->child_head;
}
parent->child_head = child;

}

// search node by use character
// Walks the child list of `n` and returns the first child whose label
// equals `moji`, or NULL when no child carries that label.
static node search_child(node n, char moji)
{
    node cur;

    for (cur = n->child_head; cur != NULL; cur = cur->next) {
        if (cur->moji == moji) {
            return cur;
        }
    }

    return NULL;
}

// search node by use character.
// if nothing, create new node
// Returns the child of `n` labelled `moji`; when there is none, a new node
// is created, linked in as a child of `n`, and returned.
static node search_child_or_create(node n, char moji)
{
    node found = search_child(n, moji);

    if (found) {
        return found;
    }

    found = initialize_node(moji);
    add_child(n, found);
    return found;
}

// free memory all child and self
// Recursively releases every node reachable through `n`'s child list and
// then `n` itself.
static void destroy_node(node n)
{
    node cur = n->child_head;

    while (cur != NULL) {
        // Read the sibling link first: `cur` is freed by the recursive call.
        node following = cur->next;

        destroy_node(cur);
        cur = following;
    }

    free(n);
}

// add encoding info static VALUE add_encode(VALUE str, rb_encoding *enc) {

rb_enc_associate(str, enc);
return str;

}

//———————————————————– // Ruby Methods // ———————————————————-

/**

new
static VALUE
t_new(int argc, VALUE *argv, VALUE klass)
{
    node root;
    VALUE obj, array, string;

    root = initialize_node(NULL_CHAR);

    obj = Data_Make_Struct(klass, struct _node, NULL, destroy_node, root);

    if (argc == 1) {
        array = argv[0];
        while((string = rb_ary_shift(argv[0])) != Qnil) {
            t_add(obj, string);
        }
    }

    return obj;
}

Public Instance Methods

<<(p1)

add

Alias for: add
add(p1) click to toggle source

add

/**
 * add
 *
 * Registers `str` as a keyword in the tree wrapped by `self`.
 *
 * Trailing CR, LF, TAB and SPACE bytes are ignored. The keyword is read up
 * to its first NUL byte (strlen), and one tree node is used per byte.
 *
 * Returns `str` on success, or Qfalse when nothing is left after trimming
 * (empty or all-whitespace input).
 */
static VALUE
t_add(VALUE self, VALUE str)
{
    node root, now;
    char *keyword;
    long i, len;

    keyword = StringValuePtr(str);

    len = (long)strlen(keyword);
    /*
     * `len > 0` must be tested first: without it an empty or
     * all-whitespace keyword makes the loop read keyword[-1].
     */
    while (len > 0 &&
            (keyword[len - 1] == CR || keyword[len - 1] == LF ||
             keyword[len - 1] == TAB || keyword[len - 1] == SPACE)) {
        len--;
    }

    if (len < 1) {
        return Qfalse;
    }

    Data_Get_Struct(self, struct _node, root);
    now = root;

    /* Descend one level per byte, creating the missing nodes on the way. */
    for (i = 0; i < len; i++) {
        now = search_child_or_create(now, keyword[i]);
    }

    /* The node reached by the last byte marks a complete keyword. */
    now->end_flag = true;

    return str;
}
Also aliased as: <<
filter_html(p1) click to toggle source

filter_html

/**
 * filter_html
 *
 * Scans `str` byte by byte and replaces every registered keyword found
 * outside of markup with `url_base % [word, word]`, where `url_base` is the
 * instance variable named by LINK_URL_VARIABLE or, when that is nil,
 * DEAULT_LINK_URL.
 *
 * Skipped regions:
 *   - everything from a BEGIN_TAG byte up to the next END_TAG byte;
 *   - when the tag name starts (case-insensitively) with A_TAG, SCRIPT_TAG,
 *     PRE_TAG, IFRAME_TAG or OBJECT_TAG, everything up to a BEGIN_TAG that
 *     is followed, two bytes later, by the same name (presumably the
 *     matching closing tag -- the byte in between is not inspected).
 *
 * Matching is longest-match: the scan keeps descending the tree and
 * remembers the last position at which a complete keyword ended.
 *
 * Returns `str` itself when no replacement was made, otherwise a new string
 * whose pieces carry the encoding of `str`. The text is processed up to its
 * first NUL byte (strlen).
 */
static VALUE
t_filter_html(VALUE self, VALUE str)
{
    node root, now, ret;
    bool in_tag;
    char *text;
    const char* inner_tag;
    long i, head_i, tail_i, copy_head_i, total_len;
    VALUE change_str, url_base, word;
    rb_encoding *enc;

    change_str = rb_str_new2(EMPTY_STRING);
    enc = rb_enc_get(str);
    text = StringValuePtr(str);

    Data_Get_Struct(self, struct _node, root);
    url_base = rb_iv_get(self, LINK_URL_VARIABLE);
    if (url_base == Qnil) {
        url_base = rb_str_new2(DEAULT_LINK_URL);
    }

    now = root;
    total_len = strlen(text);
    head_i = -1;        /* start of the candidate match, -1 = none */
    tail_i = -1;        /* last byte of the longest keyword seen so far */
    copy_head_i = 0;    /* first byte not yet copied to change_str */
    in_tag = false;
    inner_tag = NULL;   /* name of the element being skipped wholesale */

    /*
     * The loop deliberately runs one step past the last byte (i ==
     * total_len, the NUL terminator) so that a match pending at the end of
     * the text is flushed by the mismatch branch below.
     */
    for(i = 0; i <= total_len; i++) {
        if (!in_tag && text[i] == BEGIN_TAG) {
            in_tag = true;
            if (strncasecmp(&text[i + 1], A_TAG, strlen(A_TAG)) == 0) {
                inner_tag = A_TAG;
            } else if (strncasecmp(&text[i + 1], SCRIPT_TAG, strlen(SCRIPT_TAG)) == 0) {
                inner_tag = SCRIPT_TAG;
            } else if (strncasecmp(&text[i + 1], PRE_TAG, strlen(PRE_TAG)) == 0) {
                inner_tag = PRE_TAG;
            } else if (strncasecmp(&text[i + 1], IFRAME_TAG, strlen(IFRAME_TAG)) == 0) {
                inner_tag = IFRAME_TAG;
            } else if (strncasecmp(&text[i + 1], OBJECT_TAG, strlen(OBJECT_TAG)) == 0) {
                inner_tag = OBJECT_TAG;
            }
            continue;
        }

        if (in_tag && !inner_tag && text[i] == END_TAG) {
            in_tag = false;
            continue;
        }

        if (inner_tag && text[i] == BEGIN_TAG) {
            /*
             * Only look two bytes ahead when the next byte is not the
             * terminator; otherwise &text[i + 2] lies past the end of the
             * string and the comparison would read out of bounds.
             */
            if (text[i + 1] != '\0' &&
                    strncasecmp(&text[i + 2], inner_tag, strlen(inner_tag)) == 0) {
                /* in_tag stays set; the END_TAG of this tag clears it. */
                inner_tag = NULL;
                continue;
            }
        }

        if (in_tag) {
            continue;
        }

        ret = search_child(now, text[i]);

        if (ret && i != total_len) {
            /* The current byte extends the candidate match. */
            if (head_i == -1) {
                head_i = i;
            }

            if (ret->end_flag) {
                tail_i = i;
            }
            now = ret;
        } else {
            if (head_i != -1) {
                if (tail_i != -1) {
                    /* Copy the untouched text that precedes the keyword. */
                    if (copy_head_i < head_i) {
                        rb_funcall(
                            change_str,
                            rb_intern("concat"),
                            1,
                            add_encode(rb_str_new(&text[copy_head_i], (head_i - copy_head_i)), enc)
                        );
                    }

                    /* Emit url_base % [word, word] in place of the keyword. */
                    word = rb_str_new(&text[head_i], (tail_i - head_i + 1));
                    rb_funcall(
                        change_str,
                        rb_intern("concat"),
                        1,
                        add_encode(rb_funcall(url_base, rb_intern("%"), 1, rb_assoc_new(word, word)), enc)
                    );
                    /* Resume right after the keyword (the loop adds 1). */
                    i = tail_i;
                    copy_head_i = tail_i + 1;
                    tail_i = -1;
                } else {
                    /* No keyword ended here: retry from the byte after head_i. */
                    i = head_i;
                }
                head_i = -1;
            }
            now = root;
        }
    }

    if (copy_head_i == 0) {
        /* copy_head_i only advances on a replacement, so nothing changed. */
        return str;
    } else {
        /* Append whatever follows the last replacement. */
        rb_funcall(
            change_str,
            rb_intern("concat"),
            1,
            add_encode(rb_str_new(&text[copy_head_i], (total_len - copy_head_i)), enc)
        );
        return change_str;
    }
}