class Kdtree

Kdtree is an insanely fast data structure for finding the nearest
neighbor(s) to a given point. This implementation only supports 2d
points. Also, it only supports static points - there is no way to edit the
tree after it has been initialized. Kdtree should scale to millions of
points, though it's only been tested with around 1 million.

Once the tree is constructed, it can be searched with nearest and nearestk.

To avoid the startup costs associated with creating a new tree, use persist
to write the tree to disk. You can then construct the tree later from that
file.

 points = []
 points << [47.6, -122.3, 1] # Seattle
 points << [45.5, -122.8, 2] # Portland
 points << [40.7, -74.0,  3] # New York
 kd = Kdtree.new(points)

 # which city is closest to San Francisco?
 kd.nearest(34.1, -118.2) #=> 2
 # which two cities are closest to San Francisco?
 kd.nearestk(34.1, -118.2, 2) #=> [2, 1]

For more information on kd trees, see:

en.wikipedia.org/wiki/Kd-tree

Public Class Methods

new(points) → kdtree click to toggle source
new(io) → kdtree

Returns a new Kdtree. To construct a tree, pass an array of points. Each point should be an array of the form [x, y, id], where x and y are floats and id is an integer. The id is arbitrary and will be returned to you whenever you search with nearest or nearestk.

# create a new tree
points = []
points << [47.6, -122.3, 1] # Seattle
points << [40.7, -74.0, 2]  # New York
kd = Kdtree.new(points)

Alternately, you can pass in an IO object containing a persisted kdtree. This makes it possible to build the tree in advance, persist it, and start it up quickly later. See persist for more information.

static VALUE kdtree_initialize(VALUE kdtree, VALUE arg)
{
    KDTREEP;

    if (TYPE(arg) == T_ARRAY) {
        // init from array of pints
        VALUE points = arg;
        int i;
        kdtreep->len = RARRAY_LEN(points);
        kdtreep->nodes = ALLOC_N(struct kdtree_node, kdtreep->len);

        for (i = 0; i < RARRAY_LEN(points); ++i) {
            struct kdtree_node *n = kdtreep->nodes + i;

            VALUE ptr = rb_ary_entry(points, i);
            VALUE v = rb_check_array_type(ptr);
            if (NIL_P(v) || RARRAY_LEN(v) != 3) {
                continue;
            }
            n->x = NUM2DBL(rb_ary_entry(v, 0));
            n->y = NUM2DBL(rb_ary_entry(v, 1));
            n->id = NUM2INT(rb_ary_entry(v, 2));
        }

        // now build the tree
        kdtreep->root = kdtree_build(kdtreep, 0, kdtreep->len, 0);
    } else if (rb_respond_to(arg, rb_intern("read"))) {
        VALUE io = arg;
        char buf[4];
        if (rb_respond_to(io, id_binmode)) {
            rb_funcall(io, id_binmode, 0);
        }

        // check magic
        read_all(io, buf, 4);
        if (memcmp(KDTREE_MAGIC, buf, 4) != 0) {
            rb_raise(rb_eRuntimeError, "wrong magic number in kdtree file");
        }

        // read start of the struct
        read_all(io, kdtreep, sizeof(struct kdtree_data) - sizeof(struct kdtree_node *));

        // read the nodes
        kdtreep->nodes = ALLOC_N(struct kdtree_node, kdtreep->len);
        read_all(io, kdtreep->nodes, sizeof(struct kdtree_node) * kdtreep->len);
    } else {
        rb_raise(rb_eTypeError, "array or IO required to init Kdtree");
    }

    return kdtree;
}

Public Instance Methods

nearest(x, y) → id click to toggle source

Finds the point closest to x, y and returns the id for that point. Returns -1 if the tree is empty.

points = []
points << [47.6, -122.3, 1] # Seattle
points << [40.7, -74.0, 2]  # New York
kd = Kdtree.new(points)

# which city is closest to Portland?
kd.nearest(45.5, -122.8) #=> 1
# which city is closest to Boston?
kd.nearest(42.4, -71.1)  #=> 2
static VALUE kdtree_nearest(VALUE kdtree, VALUE x, VALUE y)
{
    int n_index;
    float n_dist;
    KDTREEP;

    n_index = -1;
    n_dist = INT_MAX;

    kdtree_nearest0(kdtreep, kdtreep->root, NUM2DBL(x), NUM2DBL(y), 0, &n_index, &n_dist);
    if (n_index == -1) {
        return -1;
    }
    return INT2NUM((kdtreep->nodes + n_index)->id);
}
nearestk(x, y, k) → array click to toggle source

Finds the k points closest to x, y. Returns an array of ids, sorted by distance. Returns an empty array if the tree is empty. Note that k is capped at 255.

points = []
points << [47.6, -122.3, 1] # Seattle
points << [45.5, -122.8, 2] # Portland
points << [40.7, -74.0,  3] # New York
kd = Kdtree.new(points)

# which two cities are closest to San Francisco?
kd.nearestk(34.1, -118.2, 2) #=> [2, 1]
static VALUE kdtree_nearestk(VALUE kdtree, VALUE x, VALUE y, VALUE k)
{
    // note I leave an extra slot here at the end because of the way our binary insert works
    kresult k_list[MAX_K + 1];
    int k_len = 0;
    float k_dist = INT_MAX;
    int ki = NUM2INT(k);
    VALUE ary;
    int i;
    KDTREEP;

    if (ki < 1) {
        ki = 1;
    } else if (ki > MAX_K) {
        ki = MAX_K;
    }
    kdtree_nearestk0(kdtreep, kdtreep->root, NUM2DBL(x), NUM2DBL(y), ki, 0, k_list, &k_len, &k_dist);

    // convert result to ruby array
    ary = rb_ary_new();
    for (i = 0; i < k_len; ++i) {
        rb_ary_push(ary, INT2NUM(kdtreep->nodes[k_list[i].index].id));
    }
    return ary;
}
persist(io) click to toggle source

Writes the tree out to io so you can quickly load it later with Kdtree.new. This avoids the startup cost of initializing a tree. Apart from a small header, the size of the file is proportional to the number of points, requiring 20 bytes per point.

This file is NOT PORTABLE across different architectures due to endian issues.

points = []
points << [47.6, -122.3, 1] # Seattle
points << [45.5, -122.8, 2] # Portland
points << [40.7, -74.0,  3] # New York
kd = Kdtree.new(points)

# persist the tree to disk
File.open("treefile", "w") { |f| kd.persist(f) }

...

# later, read the tree from disk
kd2 = File.open("treefile") { |f| Kdtree.new(f) }
static VALUE kdtree_persist(VALUE kdtree, VALUE io)
{
    VALUE str;
    KDTREEP;

    if (!rb_respond_to(io, rb_intern("write"))) {
        rb_raise(rb_eTypeError, "instance of IO needed");
    }
    if (rb_respond_to(io, id_binmode)) {
        rb_funcall(io, id_binmode, 0);
    }

    write_all(io, KDTREE_MAGIC, 4);
    write_all(io, kdtreep, sizeof(struct kdtree_data) - sizeof(struct kdtree_node *));
    write_all(io, kdtreep->nodes, sizeof(struct kdtree_node) * kdtreep->len);
    return io;
}
to_s → string click to toggle source

A string that tells you a bit about the tree.

static VALUE kdtree_to_s(VALUE kdtree)
{
    char buf[256];
    KDTREEP;

    sprintf(buf, "#<%s:%p nodes=%d>", rb_obj_classname(kdtree), (void*)kdtree, kdtreep->len);
    return rb_str_new(buf, strlen(buf));
}