include <ruby.h>
static VALUE rb_eCSVParseError; static ID s_read, s_to_str;
%%{
machine csv_scan; alphtype unsigned char; newline = ('\r\n' | '\n') @{ curline += 1; }; Separator = ','; UnQuotedValue = [^ \t",\r\n] [^",\r\n]*; QuotedChar = ( '""' | [^"] | newline ); QuotedValue = '"' . QuotedChar* . '"'; main := |* space; newline @{ rb_ary_push(row, coldata); rb_yield(row); coldata = Qnil; row = rb_ary_new(); }; Separator { rb_ary_push(row, coldata); coldata = Qnil; }; UnQuotedValue { unsigned char ch, *endp; int datalen; datalen = tokend - tokstart; endp = tokend - 1; while(datalen>0) { ch = *endp--; if (ch==' ' || ch=='\t') { datalen--; } else { break; } } if (datalen==0) { coldata = Qnil; } else { coldata = rb_str_new(tokstart, datalen); } }; QuotedValue { unsigned char ch, *start_p, *wptr, *rptr; int rest, datalen; start_p = wptr = tokstart; rptr = tokstart + 1; rest = tokend - tokstart - 2; datalen = 0; while(rest>0) { ch = *rptr++; if (ch=='"') { rptr++; rest--; } *wptr++ = ch; datalen++; rest--; } coldata = rb_str_new( start_p, datalen ); }; *|;
}%%
%% write data nofinal;
define BUFSIZE 131072
VALUE csv_scan(VALUE self, VALUE port) {
int cs, act, have = 0, nread = 0, curline = 1; unsigned char *tokstart = NULL, *tokend = NULL, *buf; VALUE row, coldata; VALUE bufsize = Qnil; int done=0, buffer_size; if ( !rb_respond_to( port, s_read ) ) { if ( rb_respond_to( port, s_to_str ) ) { port = rb_funcall( port, s_to_str, 0 ); StringValue(port); } else { rb_raise( rb_eArgError, "bad argument, String or IO only please." ); } } buffer_size = BUFSIZE; if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) { bufsize = rb_ivar_get(self, rb_intern("@buffer_size")); if (!NIL_P(bufsize)) { buffer_size = NUM2INT(bufsize); } } buf = ALLOC_N(unsigned char, buffer_size); %% write init; row = rb_ary_new(); coldata = Qnil; while( !done ) { VALUE str; unsigned char *p = buf + have, *pe; int len, space = buffer_size - have; if ( space == 0 ) { rb_raise(rb_eCSVParseError, "ran out of buffer on line %d.", curline); } if ( rb_respond_to( port, s_read ) ) { str = rb_funcall( port, s_read, 1, INT2FIX(space) ); } else { str = rb_str_substr( port, nread, space ); } StringValue(str); memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) ); len = RSTRING_LEN(str); nread += len; /* If this is the last buffer, tack on an EOF. */ if ( len < space ) { p[len++] = 0; done = 1; } pe = p + len; %% write exec; if ( cs == csv_scan_error ) { free(buf); rb_raise(rb_eCSVParseError, "parse error on line %d.", curline); } if ( tokstart == 0 ) { have = 0; } else { have = pe - tokstart; memmove( buf, tokstart, have ); tokend = buf + (tokend - tokstart); tokstart = buf; } } free(buf); return Qnil;
}
void Init_csvscan() {
VALUE mCSVScan = rb_define_module("CSVScan"); rb_define_attr(rb_singleton_class(mCSVScan), "buffer_size", 1, 1); rb_define_singleton_method(mCSVScan, "scan", csv_scan, 1); rb_eCSVParseError = rb_define_class_under(mCSVScan, "ParseError", rb_eException); s_read = rb_intern("read"); s_to_str = rb_intern("to_str");
}