class Kdtree
Kdtree is an insanely fast data structure for finding the nearest neighbor(s) to a given point. This implementation only supports 2d points. Also, it only supports static points - there is no way to edit the tree after it has been initialized. Kdtree should scale to millions of points, though it's only been tested with around 1 million. Once the tree is constructed, it can be searched with nearest and nearestk. To avoid the startup costs associated with creating a new tree, use persist to write the tree to disk. You can then construct the tree later from that file. points = [] points << [47.6, -122.3, 1] # Seattle points << [45.5, -122.8, 2] # Portland points << [40.7, -74.0, 3] # New York kd = Kdtree.new(points) # which city is closest to San Francisco? kd.nearest(34.1, -118.2) #=> 2 # which two cities are closest to San Francisco? kd.nearestk(34.1, -118.2, 2) #=> [2, 1]
For more information on kd trees, see:
Public Class Methods
Returns a new Kdtree
. To construct a tree, pass an array of points. Each point should be an array of the form [x, y, id]
, where x and y are floats and id is an integer. The id is arbitrary and will be returned to you whenever you search with nearest or nearestk.
# create a new tree points = [] points << [47.6, -122.3, 1] # Seattle points << [40.7, -74.0, 2] # New York kd = Kdtree.new(points)
Alternately, you can pass in an IO object containing a persisted kdtree. This makes it possible to build the tree in advance, persist it, and start it up quickly later. See persist for more information.
static VALUE kdtree_initialize(VALUE kdtree, VALUE arg) { KDTREEP; if (TYPE(arg) == T_ARRAY) { // init from array of pints VALUE points = arg; int i; kdtreep->len = RARRAY_LEN(points); kdtreep->nodes = ALLOC_N(struct kdtree_node, kdtreep->len); for (i = 0; i < RARRAY_LEN(points); ++i) { struct kdtree_node *n = kdtreep->nodes + i; VALUE ptr = rb_ary_entry(points, i); VALUE v = rb_check_array_type(ptr); if (NIL_P(v) || RARRAY_LEN(v) != 3) { continue; } n->x = NUM2DBL(rb_ary_entry(v, 0)); n->y = NUM2DBL(rb_ary_entry(v, 1)); n->id = NUM2INT(rb_ary_entry(v, 2)); } // now build the tree kdtreep->root = kdtree_build(kdtreep, 0, kdtreep->len, 0); } else if (rb_respond_to(arg, rb_intern("read"))) { VALUE io = arg; char buf[4]; if (rb_respond_to(io, id_binmode)) { rb_funcall(io, id_binmode, 0); } // check magic read_all(io, buf, 4); if (memcmp(KDTREE_MAGIC, buf, 4) != 0) { rb_raise(rb_eRuntimeError, "wrong magic number in kdtree file"); } // read start of the struct read_all(io, kdtreep, sizeof(struct kdtree_data) - sizeof(struct kdtree_node *)); // read the nodes kdtreep->nodes = ALLOC_N(struct kdtree_node, kdtreep->len); read_all(io, kdtreep->nodes, sizeof(struct kdtree_node) * kdtreep->len); } else { rb_raise(rb_eTypeError, "array or IO required to init Kdtree"); } return kdtree; }
Public Instance Methods
Finds the point closest to x, y and returns the id for that point. Returns -1 if the tree is empty.
points = [] points << [47.6, -122.3, 1] # Seattle points << [40.7, -74.0, 2] # New York kd = Kdtree.new(points) # which city is closest to Portland? kd.nearest(45.5, -122.8) #=> 1 # which city is closest to Boston? kd.nearest(42.4, -71.1) #=> 2
static VALUE kdtree_nearest(VALUE kdtree, VALUE x, VALUE y) { int n_index; float n_dist; KDTREEP; n_index = -1; n_dist = INT_MAX; kdtree_nearest0(kdtreep, kdtreep->root, NUM2DBL(x), NUM2DBL(y), 0, &n_index, &n_dist); if (n_index == -1) { return -1; } return INT2NUM((kdtreep->nodes + n_index)->id); }
Finds the k points closest to x, y. Returns an array of ids, sorted by distance. Returns an empty array if the tree is empty. Note that k is capped at 255.
points = [] points << [47.6, -122.3, 1] # Seattle points << [45.5, -122.8, 2] # Portland points << [40.7, -74.0, 3] # New York kd = Kdtree.new(points) # which two cities are closest to San Francisco? kd.nearestk(34.1, -118.2, 2) #=> [2, 1]
static VALUE kdtree_nearestk(VALUE kdtree, VALUE x, VALUE y, VALUE k) { // note I leave an extra slot here at the end because of the way our binary insert works kresult k_list[MAX_K + 1]; int k_len = 0; float k_dist = INT_MAX; int ki = NUM2INT(k); VALUE ary; int i; KDTREEP; if (ki < 1) { ki = 1; } else if (ki > MAX_K) { ki = MAX_K; } kdtree_nearestk0(kdtreep, kdtreep->root, NUM2DBL(x), NUM2DBL(y), ki, 0, k_list, &k_len, &k_dist); // convert result to ruby array ary = rb_ary_new(); for (i = 0; i < k_len; ++i) { rb_ary_push(ary, INT2NUM(kdtreep->nodes[k_list[i].index].id)); } return ary; }
Writes the tree out to io so you can quickly load it later with Kdtree.new
. This avoids the startup cost of initializing a tree. Apart from a small header, the size of the file is proportional to the number of points, requiring 20 bytes per point.
This file is NOT PORTABLE across different architectures due to endian issues.
points = [] points << [47.6, -122.3, 1] # Seattle points << [45.5, -122.8, 2] # Portland points << [40.7, -74.0, 3] # New York kd = Kdtree.new(points) # persist the tree to disk File.open("treefile", "w") { |f| kd.persist(f) } ... # later, read the tree from disk kd2 = File.open("treefile") { |f| Kdtree.new(f) }
static VALUE kdtree_persist(VALUE kdtree, VALUE io) { VALUE str; KDTREEP; if (!rb_respond_to(io, rb_intern("write"))) { rb_raise(rb_eTypeError, "instance of IO needed"); } if (rb_respond_to(io, id_binmode)) { rb_funcall(io, id_binmode, 0); } write_all(io, KDTREE_MAGIC, 4); write_all(io, kdtreep, sizeof(struct kdtree_data) - sizeof(struct kdtree_node *)); write_all(io, kdtreep->nodes, sizeof(struct kdtree_node) * kdtreep->len); return io; }
A string that tells you a bit about the tree.
static VALUE kdtree_to_s(VALUE kdtree) { char buf[256]; KDTREEP; sprintf(buf, "#<%s:%p nodes=%d>", rb_obj_classname(kdtree), (void*)kdtree, kdtreep->len); return rb_str_new(buf, strlen(buf)); }