module Nokogumbo

Constants

DEFAULT_MAX_ATTRIBUTES

The default maximum number of attributes per element.

DEFAULT_MAX_ERRORS

The default maximum number of errors for parsing a document or a fragment.

DEFAULT_MAX_TREE_DEPTH

The default maximum depth of the DOM tree produced by parsing a document or fragment.

LINE_SUPPORTED
VERSION

Public Class Methods

fragment(doc_fragment, tags, ctx, max_attributes, max_errors, max_depth)
/*
 * Parse `tags` as an HTML fragment and attach the result to `doc_fragment`.
 *
 * Parameters:
 *   self           - receiver (unused here).
 *   doc_fragment   - object that responds to `document`; passed on as the
 *                    parse target (`url_or_frag`).
 *   tags           - the input handed to perform_parse().
 *   ctx            - fragment context, one of:
 *                      nil    -> an HTML "body" element;
 *                      String -> a tag name, optionally prefixed with
 *                                "svg:", "html:" or "math:" (matched
 *                                case-insensitively). A bare "svg" or "math"
 *                                is placed in its own namespace;
 *                      other  -> an object queried via `name`, `element?`,
 *                                `[]` and (when it responds to it) `parent`.
 *   max_attributes - Integer, copied to options.max_attributes.
 *   max_errors     - Integer, copied to options.max_errors.
 *   max_depth      - Integer; a negative value is passed on as -1, otherwise
 *                    one is added to account for the HTML element.
 *
 * Returns Qnil.
 *
 * Raises ArgumentError when a String context carries a namespace prefix other
 * than svg/html/math. The Ruby conversion macros used below (NUM2INT,
 * StringValueCStr, Check_Type) may raise as well.
 *
 * NOTE: `parent` and `internal_subset` are IDs defined outside this block.
 */
static VALUE fragment (
  VALUE self,
  VALUE doc_fragment,
  VALUE tags,
  VALUE ctx,
  VALUE max_attributes,
  VALUE max_errors,
  VALUE max_depth
) {
  ID name = rb_intern_const("name");
  const char *ctx_tag;
  GumboNamespaceEnum ctx_ns;
  GumboQuirksModeEnum quirks_mode;
  bool form = false;
  const char *encoding = NULL;

  if (NIL_P(ctx)) {
    // No context given: behave as if parsing inside <body>.
    ctx_tag = "body";
    ctx_ns = GUMBO_NAMESPACE_HTML;
  } else if (TYPE(ctx) == T_STRING) {
    ctx_tag = StringValueCStr(ctx);
    ctx_ns = GUMBO_NAMESPACE_HTML;
    size_t len = RSTRING_LEN(ctx);
    const char *colon = memchr(ctx_tag, ':', len);
    if (colon) {
      // The prefix length selects which namespace names are candidates.
      switch (colon - ctx_tag) {
      case 3:
        if (st_strncasecmp(ctx_tag, "svg", 3) != 0)
          goto error;
        ctx_ns = GUMBO_NAMESPACE_SVG;
        break;
      case 4:
        if (st_strncasecmp(ctx_tag, "html", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_HTML;
        else if (st_strncasecmp(ctx_tag, "math", 4) == 0)
          ctx_ns = GUMBO_NAMESPACE_MATHML;
        else
          goto error;
        break;
      default:
      error:
        // "%.*s" (precision) limits the output to the prefix before the
        // colon; "%*s" would only set a minimum width and print the whole
        // context string.
        rb_raise(rb_eArgError, "Invalid context namespace '%.*s'", (int)(colon - ctx_tag), ctx_tag);
      }
      // Skip past "prefix:" so ctx_tag is the bare tag name.
      ctx_tag = colon+1;
    } else {
      // For convenience, put 'svg' and 'math' in their namespaces.
      if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0)
        ctx_ns = GUMBO_NAMESPACE_SVG;
      else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0)
        ctx_ns = GUMBO_NAMESPACE_MATHML;
    }

    // Check if it's a form.
    form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
  } else {
    ID element_ = rb_intern_const("element?");

    // Context fragment name.
    VALUE tag_name = rb_funcall(ctx, name, 0);
    assert(RTEST(tag_name));
    Check_Type(tag_name, T_STRING);
    ctx_tag = StringValueCStr(tag_name);

    // Context fragment namespace.
    ctx_ns = lookup_namespace(ctx, true);

    // Check for a form ancestor, including self.
    for (VALUE node = ctx;
         !NIL_P(node);
         node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
      if (!RTEST(rb_funcall(node, element_, 0)))
        continue;
      VALUE element_name = rb_funcall(node, name, 0);
      // The length is already known to be 4, so use the bounded comparison
      // rather than relying on RSTRING_PTR being NUL-terminated.
      if (RSTRING_LEN(element_name) == 4
          && !st_strncasecmp(RSTRING_PTR(element_name), "form", 4)
          && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
        form = true;
        break;
      }
    }

    // Encoding: only read the "encoding" attribute for annotation-xml.
    if (RSTRING_LEN(tag_name) == 14
        && !st_strcasecmp(ctx_tag, "annotation-xml")) {
      VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
                             rb_utf8_str_new_static("encoding", 8));
      if (RTEST(enc)) {
        Check_Type(enc, T_STRING);
        encoding = StringValueCStr(enc);
      }
    }
  }

  // Quirks mode: derived from the owning document's internal subset, if any.
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
  if (NIL_P(dtd)) {
    quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
  } else {
    VALUE dtd_name = rb_funcall(dtd, name, 0);
    VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
    VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
    quirks_mode = gumbo_compute_quirks_mode (
      NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name),
      NIL_P(pubid)? NULL:StringValueCStr(pubid),
      NIL_P(sysid)? NULL:StringValueCStr(sysid)
    );
  }

  // Perform a fragment parse.
  int depth = NUM2INT(max_depth);
  GumboOptions options = kGumboDefaultOptions;
  options.max_attributes = NUM2INT(max_attributes);
  options.max_errors = NUM2INT(max_errors);
  // Add one to account for the HTML element.
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
  options.fragment_context = ctx_tag;
  options.fragment_namespace = ctx_ns;
  options.fragment_encoding = encoding;
  options.quirks_mode = quirks_mode;
  options.fragment_context_has_form_ancestor = form;

  GumboOutput *output = perform_parse(&options, tags);
  ParseArgs args = {
    .output = output,
    .input = tags,
    .url_or_frag = doc_fragment,
    .doc = (xmlDocPtr)extract_xml_node(doc),
  };
  VALUE parse_args = wrap_parse_args(&args);
  // rb_ensure invokes parse_cleanup whether or not fragment_continue raises.
  rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
  return Qnil;
}
parse(input, url, max_attributes, max_errors, max_depth)
/*
 * Parse `input` as a complete document.
 *
 * Parameters:
 *   self           - receiver (unused here).
 *   input          - the input handed to perform_parse().
 *   url            - stored in the parse arguments as `url_or_frag`.
 *   max_attributes - Integer, copied to the parser's max_attributes option.
 *   max_errors     - Integer, copied to the parser's max_errors option.
 *   max_depth      - Integer, copied to the parser's max_tree_depth option.
 *
 * Returns whatever rb_ensure() yields from running parse_continue.
 */
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
  // Start from the library defaults and override only the supplied limits.
  GumboOptions opts = kGumboDefaultOptions;
  opts.max_attributes = NUM2INT(max_attributes);
  opts.max_errors = NUM2INT(max_errors);
  opts.max_tree_depth = NUM2INT(max_depth);

  // Run the parser and bundle its output with the values that
  // parse_continue / parse_cleanup receive.
  ParseArgs state = {
    .output = perform_parse(&opts, input),
    .input = input,
    .url_or_frag = url,
    .doc = NIL,
  };
  VALUE wrapped = wrap_parse_args(&state);

  // rb_ensure invokes parse_cleanup whether or not parse_continue raises.
  return rb_ensure(parse_continue, wrapped, parse_cleanup, wrapped);
}