class Nokogiri::XML::Reader

The Reader parser allows you to effectively pull-parse an XML document. Once instantiated, call Nokogiri::XML::Reader#each to iterate over each node. Note that you may only iterate over the document once!

Nokogiri::XML::Reader parses an XML document similar to the way a cursor would move. The Reader is given an XML document, and yields nodes to an each block.

The Reader parser might be good for when you need the speed and low memory usage of the SAX parser, but do not want to write a Document handler.

Here is an example of usage:

# Build a Reader over an inline XML document (the heredoc is the XML source).
reader = Nokogiri::XML::Reader(<<-eoxml)
  <x xmlns:tenderlove='http://tenderlovemaking.com/'>
    <tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
  </x>
eoxml

# Walk the document once; the block is called after each successful read.
reader.each do |node|

  # node is the reader itself (an instance of Nokogiri::XML::Reader),
  # positioned on the node that was just read
  puts node.name

end

Nokogiri::XML::Reader#each can only be called once! Once the cursor moves through the entire document, you must parse the document again. It may be better to capture all information you need during a single iteration.

⚠ libxml2 does not support error recovery in the Reader parser. The `RECOVER` ParseOption is ignored. If a syntax error is encountered during parsing, an exception will be raised.

Constants

TYPE_ATTRIBUTE

Attribute node type

TYPE_CDATA

CDATA node type

TYPE_COMMENT

Comment node type

TYPE_DOCUMENT

Document node type

TYPE_DOCUMENT_FRAGMENT

Document Fragment node type

TYPE_DOCUMENT_TYPE

Document Type node type

TYPE_ELEMENT

Element node type

TYPE_END_ELEMENT

Element end node type

TYPE_END_ENTITY

Entity end node type

TYPE_ENTITY

Entity node type

TYPE_ENTITY_REFERENCE

Entity Reference node type

TYPE_NONE
TYPE_NOTATION

Notation node type

TYPE_PROCESSING_INSTRUCTION

PI node type

TYPE_SIGNIFICANT_WHITESPACE

Significant Whitespace node type

TYPE_TEXT

Text node type

TYPE_WHITESPACE

Whitespace node type

TYPE_XML_DECLARATION

XML Declaration node type

Attributes

errors[RW]

A list of errors encountered while parsing

source[R]

The XML source

Public Class Methods

from_io(io, url = nil, encoding = nil, options = 0) click to toggle source

Create a new reader that parses io

/*
 * Create a new reader that pulls its input from an IO-like Ruby object.
 *
 * Ruby arguments: io (required), url, encoding, options (all optional).
 *
 * Raises ArgumentError when io is nil/false and RuntimeError when libxml2
 * cannot create the reader. The new object's initialize is invoked with
 * (io, url, encoding).
 */
static VALUE
from_io(int argc, VALUE *argv, VALUE klass)
{
  VALUE rb_io, rb_url, encoding, rb_options;
  xmlTextReaderPtr reader;
  const char *c_url      = NULL;
  const char *c_encoding = NULL;
  int c_options           = 0;
  VALUE rb_reader, args[3];

  rb_scan_args(argc, argv, "13", &rb_io, &rb_url, &encoding, &rb_options);

  if (!RTEST(rb_io)) { rb_raise(rb_eArgError, "io cannot be nil"); }
  if (RTEST(rb_url)) { c_url = StringValueCStr(rb_url); }
  if (RTEST(encoding)) { c_encoding = StringValueCStr(encoding); }
  if (RTEST(rb_options)) { c_options = (int)NUM2INT(rb_options); }

  /* libxml2 reads from and closes the Ruby IO through the noko_io_* callbacks. */
  reader = xmlReaderForIO(
             (xmlInputReadCallback)noko_io_read,
             (xmlInputCloseCallback)noko_io_close,
             (void *)rb_io,
             c_url,
             c_encoding,
             c_options
           );

  /* Nothing was allocated on failure, so there is nothing to free here. */
  if (reader == NULL) {
    rb_raise(rb_eRuntimeError, "couldn't create a parser");
  }

  rb_reader = TypedData_Wrap_Struct(klass, &xml_reader_type, reader);
  args[0] = rb_io;
  args[1] = rb_url;
  args[2] = encoding;
  rb_obj_call_init(rb_reader, 3, args);

  return rb_reader;
}
from_memory(string, url = nil, encoding = nil, options = 0) click to toggle source

Create a new reader that parses string

/*
 * Create a new reader that parses an in-memory string.
 *
 * Ruby arguments: string (required), url, encoding, options (all optional).
 *
 * Raises ArgumentError when string is nil/false and RuntimeError when libxml2
 * cannot create the reader. The new object's initialize is invoked with
 * (string, url, encoding).
 */
static VALUE
from_memory(int argc, VALUE *argv, VALUE klass)
{
  VALUE rb_buffer, rb_url, encoding, rb_options;
  xmlTextReaderPtr reader;
  const char *c_url      = NULL;
  const char *c_encoding = NULL;
  int c_options           = 0;
  VALUE rb_reader, args[3];

  rb_scan_args(argc, argv, "13", &rb_buffer, &rb_url, &encoding, &rb_options);

  if (!RTEST(rb_buffer)) { rb_raise(rb_eArgError, "string cannot be nil"); }
  if (RTEST(rb_url)) { c_url = StringValueCStr(rb_url); }
  if (RTEST(encoding)) { c_encoding = StringValueCStr(encoding); }
  if (RTEST(rb_options)) { c_options = (int)NUM2INT(rb_options); }

  /*
   * NOTE(review): the buffer length is narrowed to int for libxml2; a string
   * longer than INT_MAX bytes would not be passed through intact -- confirm
   * whether callers can reach that size.
   */
  reader = xmlReaderForMemory(
             StringValuePtr(rb_buffer),
             (int)RSTRING_LEN(rb_buffer),
             c_url,
             c_encoding,
             c_options
           );

  /* Nothing was allocated on failure, so there is nothing to free here. */
  if (reader == NULL) {
    rb_raise(rb_eRuntimeError, "couldn't create a parser");
  }

  rb_reader = TypedData_Wrap_Struct(klass, &xml_reader_type, reader);
  args[0] = rb_buffer;
  args[1] = rb_url;
  args[2] = encoding;
  rb_obj_call_init(rb_reader, 3, args);

  return rb_reader;
}

Public Instance Methods

attribute(name) click to toggle source

Get the value of attribute named name

/*
 * Look up the attribute called +name+ on the reader's current node.
 *
 * Returns the attribute value as a Ruby string, or nil when +name+ is nil or
 * libxml2 reports no value for it.
 */
static VALUE
reader_attribute(VALUE self, VALUE name)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_value;
  VALUE rb_result = Qnil;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  if (!NIL_P(name)) {
    /* Coerce to a String first so the C-string conversion below is valid. */
    name = StringValue(name);
    c_value = xmlTextReaderGetAttribute(c_reader, (xmlChar *)StringValueCStr(name));

    if (c_value != NULL) {
      rb_result = NOKOGIRI_STR_NEW2(c_value);
      /* The value is copied into the Ruby string above, so release libxml2's buffer. */
      xmlFree(c_value);
    }
  }

  return rb_result;
}
attribute_at(index) click to toggle source

Get the value of attribute at index

/*
 * Fetch the value of the attribute at position +index+ on the current node.
 *
 * +index+ is converted with rb_Integer. Returns nil when +index+ is nil or
 * libxml2 reports no value at that position.
 */
static VALUE
attribute_at(VALUE self, VALUE index)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_value;
  VALUE rb_result = Qnil;
  int c_index;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  if (NIL_P(index)) { return rb_result; }

  index = rb_Integer(index);
  c_index = (int)NUM2INT(index);

  c_value = xmlTextReaderGetAttributeNo(c_reader, c_index);
  if (c_value != NULL) {
    rb_result = NOKOGIRI_STR_NEW2(c_value);
    /* Release libxml2's copy once the Ruby string has been built. */
    xmlFree(c_value);
  }

  return rb_result;
}
attribute_count click to toggle source

Get the number of attributes for the current node

/*
 * Number of attributes on the reader's current node.
 *
 * Returns an Integer, or nil when libxml2 reports -1.
 */
static VALUE
attribute_count(VALUE self)
{
  xmlTextReaderPtr c_reader;
  int c_count;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_count = xmlTextReaderAttributeCount(c_reader);

  return (c_count == -1) ? Qnil : INT2NUM(c_count);
}
attribute_hash() → Hash<String ⇒ String> click to toggle source

Get the attributes of the current node as a Hash of names and values.

See related: #attributes and #namespaces

/*
 * Build a Hash mapping attribute names to values for the current node.
 *
 * Returns an empty Hash when the node has no attributes. If expanding the
 * node fails, raises a Nokogiri syntax error built from the first recorded
 * error, or returns nil when no error was recorded.
 */
static VALUE
rb_xml_reader_attribute_hash(VALUE rb_reader)
{
  VALUE rb_result = rb_hash_new();
  VALUE rb_error_list;
  xmlTextReaderPtr c_reader;
  xmlNodePtr c_expanded;
  xmlAttrPtr c_attr;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);

  if (!has_attributes(c_reader)) {
    return rb_result;
  }

  rb_error_list = rb_funcall(rb_reader, rb_intern("errors"), 0);

  /* Collect libxml2 errors into the reader's error list only while expanding. */
  xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
  c_expanded = xmlTextReaderExpand(c_reader);
  xmlSetStructuredErrorFunc(NULL, NULL);

  if (c_expanded == NULL && RARRAY_LEN(rb_error_list) > 0) {
    VALUE rb_first_error = rb_ary_entry(rb_error_list, 0);
    VALUE rb_message = rb_funcall(rb_first_error, rb_intern("to_s"), 0);
    rb_exc_raise(rb_class_new_instance(1, &rb_message, cNokogiriXmlSyntaxError));
  }
  if (c_expanded == NULL) {
    return Qnil;
  }

  for (c_attr = c_expanded->properties; c_attr != NULL; c_attr = c_attr->next) {
    VALUE rb_key = NOKOGIRI_STR_NEW2(c_attr->name);
    VALUE rb_val = Qnil;
    xmlChar *c_content = xmlNodeGetContent((xmlNode *)c_attr);

    if (c_content != NULL) {
      rb_val = NOKOGIRI_STR_NEW2(c_content);
      /* Content was copied into the Ruby string; free libxml2's buffer. */
      xmlFree(c_content);
    }

    rb_hash_aset(rb_result, rb_key, rb_val);
  }

  return rb_result;
}
attributes() click to toggle source

Get the attributes and namespaces of the current node as a Hash.

This is the union of Reader#attribute_hash and Reader#namespaces

Returns

(Hash<String, String>) Attribute names and values, and namespace prefixes and hrefs.

# File lib/nokogiri/xml/reader.rb, line 93
# Attributes and namespace declarations of the current node, merged into a
# single Hash; namespace entries win when a key appears in both.
def attributes
  plain_attributes = attribute_hash
  plain_attributes.merge(namespaces)
end
attributes? click to toggle source

Does this node have attributes?

/*
 * Does the current node have attributes?
 *
 * Maps has_attributes() results: 0 -> false, 1 -> true, anything else -> nil.
 */
static VALUE
attributes_eh(VALUE self)
{
  xmlTextReaderPtr c_reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  switch (has_attributes(c_reader)) {
    case 0:
      return Qfalse;
    case 1:
      return Qtrue;
    default:
      return Qnil;
  }
}
base_uri click to toggle source

Get the xml:base of the node

/*
 * Base URI reported by libxml2 for the current node.
 *
 * Returns a Ruby string, or nil when libxml2 has no base URI to report.
 */
static VALUE
rb_xml_reader_base_uri(VALUE rb_reader)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_uri;
  VALUE rb_uri = Qnil;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);

  c_uri = xmlTextReaderBaseUri(c_reader);
  if (c_uri != NULL) {
    rb_uri = NOKOGIRI_STR_NEW2(c_uri);
    /* The URI was copied into the Ruby string; release libxml2's buffer. */
    xmlFree(c_uri);
  }

  return rb_uri;
}
default? click to toggle source

Was an attribute generated from the default value in the DTD or schema?

/*
 * Was the current attribute generated from a default value?
 *
 * Maps xmlTextReaderIsDefault results: 0 -> false, 1 -> true, anything
 * else -> nil.
 */
static VALUE
default_eh(VALUE self)
{
  xmlTextReaderPtr c_reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  switch (xmlTextReaderIsDefault(c_reader)) {
    case 0:
      return Qfalse;
    case 1:
      return Qtrue;
    default:
      return Qnil;
  }
}
depth click to toggle source

Get the depth of the node

/*
 * Depth of the current node in the document tree.
 *
 * Returns an Integer, or nil when libxml2 reports -1.
 */
static VALUE
depth(VALUE self)
{
  xmlTextReaderPtr c_reader;
  int c_depth;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_depth = xmlTextReaderDepth(c_reader);

  return (c_depth == -1) ? Qnil : INT2NUM(c_depth);
}
each() { |cursor| ... } click to toggle source

Move the cursor through the document yielding the cursor to the block

# File lib/nokogiri/xml/reader.rb, line 99
# Advance through the document, yielding the cursor after every successful
# read; stops as soon as #read returns a falsy value.
def each
  cursor = read
  while cursor
    yield cursor
    cursor = read
  end
end
empty_element? # → true or false click to toggle source

Returns true if the current node is empty, otherwise false.

/*
 * Returns true if the current node is an empty element, otherwise false.
 *
 * xmlTextReaderIsEmptyElement returns 1 for an empty element, 0 for a
 * non-empty one and -1 on error. Only 1 maps to true, so an error is not
 * misreported as an empty element.
 */
static VALUE
empty_element_p(VALUE self)
{
  xmlTextReaderPtr reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader);

  if (xmlTextReaderIsEmptyElement(reader) == 1) {
    return Qtrue;
  }

  return Qfalse;
}
Also aliased as: self_closing?
encoding() click to toggle source
/*
 * Encoding of the document being read.
 *
 * Prefers the encoding reported by libxml2; otherwise falls back to the
 * reader's @encoding instance variable. Returns nil when neither is set.
 */
static VALUE
rb_xml_reader_encoding(VALUE rb_reader)
{
  xmlTextReaderPtr c_reader;
  const char *c_encoding;
  VALUE rb_fallback;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);

  c_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
  if (c_encoding != NULL) {
    return NOKOGIRI_STR_NEW2(c_encoding);
  }

  rb_fallback = rb_iv_get(rb_reader, "@encoding");

  return RTEST(rb_fallback) ? rb_fallback : Qnil;
}
inner_xml click to toggle source

Read the contents of the current node, including child nodes and markup. Returns a utf-8 encoded string.

/*
 * Serialize the contents of the current node, including child nodes and
 * markup.
 *
 * Returns a Ruby string, or nil when libxml2 produces nothing.
 */
static VALUE
inner_xml(VALUE self)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_xml;
  VALUE rb_xml;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_xml = xmlTextReaderReadInnerXml(c_reader);
  if (c_xml == NULL) {
    return Qnil;
  }

  rb_xml = NOKOGIRI_STR_NEW2((char *)c_xml);
  /* The markup was copied into the Ruby string; release libxml2's buffer. */
  xmlFree(c_xml);

  return rb_xml;
}
lang click to toggle source

Get the xml:lang scope within which the node resides.

/*
 * The xml:lang scope within which the current node resides.
 *
 * Returns a Ruby string, or nil when libxml2 reports none.
 */
static VALUE
lang(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_lang;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_lang = (const char *)xmlTextReaderConstXmlLang(c_reader);
  if (c_lang != NULL) {
    return NOKOGIRI_STR_NEW2(c_lang);
  }

  return Qnil;
}
local_name click to toggle source

Get the local name of the node

/*
 * Local name of the current node.
 *
 * Returns a Ruby string, or nil when libxml2 reports none.
 */
static VALUE
local_name(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_local_name;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_local_name = (const char *)xmlTextReaderConstLocalName(c_reader);
  if (c_local_name != NULL) {
    return NOKOGIRI_STR_NEW2(c_local_name);
  }

  return Qnil;
}
name click to toggle source

Get the name of the node. Returns a utf-8 encoded string.

/*
 * Name of the current node.
 *
 * Returns a Ruby string, or nil when libxml2 reports none.
 */
static VALUE
name(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_name;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_name = (const char *)xmlTextReaderConstName(c_reader);
  if (c_name != NULL) {
    return NOKOGIRI_STR_NEW2(c_name);
  }

  return Qnil;
}
namespace_uri click to toggle source

Get the URI defining the namespace associated with the node

/*
 * URI of the namespace associated with the current node.
 *
 * Returns a Ruby string, or nil when libxml2 reports none.
 */
static VALUE
namespace_uri(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_uri;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_uri = (const char *)xmlTextReaderConstNamespaceUri(c_reader);
  if (c_uri != NULL) {
    return NOKOGIRI_STR_NEW2(c_uri);
  }

  return Qnil;
}
namespaces click to toggle source

Get a hash of namespaces for this Node

/*
 * Build a Hash of the namespaces declared on the current node.
 *
 * Returns an empty Hash when has_attributes() is false for the node. If
 * expanding the node fails, raises a Nokogiri syntax error built from the
 * first recorded error, or returns nil when no error was recorded.
 */
static VALUE
rb_xml_reader_namespaces(VALUE rb_reader)
{
  VALUE rb_result = rb_hash_new();
  VALUE rb_error_list;
  xmlTextReaderPtr c_reader;
  xmlNodePtr c_expanded;

  TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);

  if (!has_attributes(c_reader)) {
    return rb_result;
  }

  rb_error_list = rb_funcall(rb_reader, rb_intern("errors"), 0);

  /* Collect libxml2 errors into the reader's error list only while expanding. */
  xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
  c_expanded = xmlTextReaderExpand(c_reader);
  xmlSetStructuredErrorFunc(NULL, NULL);

  if (c_expanded == NULL && RARRAY_LEN(rb_error_list) > 0) {
    VALUE rb_first_error = rb_ary_entry(rb_error_list, 0);
    VALUE rb_message = rb_funcall(rb_first_error, rb_intern("to_s"), 0);
    rb_exc_raise(rb_class_new_instance(1, &rb_message, cNokogiriXmlSyntaxError));
  }
  if (c_expanded == NULL) {
    return Qnil;
  }

  Nokogiri_xml_node_namespaces(c_expanded, rb_result);

  return rb_result;
}
node_type click to toggle source

Get the type of the reader's current node

/*
 * Type of the reader's current node, as the integer reported by
 * xmlTextReaderNodeType.
 */
static VALUE
node_type(VALUE self)
{
  xmlTextReaderPtr c_reader;
  int c_type;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_type = xmlTextReaderNodeType(c_reader);

  return INT2NUM(c_type);
}
outer_xml click to toggle source

Read the current node and its contents, including child nodes and markup. Returns a utf-8 encoded string.

/*
 * Serialize the current node together with its contents, including child
 * nodes and markup.
 *
 * Returns a Ruby string, or nil when libxml2 produces nothing.
 */
static VALUE
outer_xml(VALUE self)
{
  xmlTextReaderPtr c_reader;
  xmlChar *c_xml;
  VALUE rb_xml;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_xml = xmlTextReaderReadOuterXml(c_reader);
  if (c_xml == NULL) {
    return Qnil;
  }

  rb_xml = NOKOGIRI_STR_NEW2((char *)c_xml);
  /* The markup was copied into the Ruby string; release libxml2's buffer. */
  xmlFree(c_xml);

  return rb_xml;
}
prefix click to toggle source

Get the shorthand reference to the namespace associated with the node.

/*
 * Namespace prefix associated with the current node.
 *
 * Returns a Ruby string, or nil when libxml2 reports none.
 */
static VALUE
prefix(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_prefix;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_prefix = (const char *)xmlTextReaderConstPrefix(c_reader);
  if (c_prefix != NULL) {
    return NOKOGIRI_STR_NEW2(c_prefix);
  }

  return Qnil;
}
read click to toggle source

Move the Reader forward through the XML document.

/*
 * Advance the reader by one step.
 *
 * Returns self when xmlTextReaderRead reports 1, nil when it reports 0.
 * For any other result, raises the last libxml2 error wrapped as a Nokogiri
 * syntax error, or a RuntimeError when libxml2 has no last error recorded.
 */
static VALUE
read_more(VALUE self)
{
  xmlTextReaderPtr reader;
  xmlErrorConstPtr error;
  VALUE error_list;
  int ret;
  xmlDocPtr c_document;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader);

  error_list = rb_funcall(self, rb_intern("errors"), 0);

  /* Route structured libxml2 errors into the reader's error list for the
   * duration of this read only, then uninstall the handler. */
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
  ret = xmlTextReaderRead(reader);
  xmlSetStructuredErrorFunc(NULL, NULL);

  /* If the document libxml2 is building has no encoding yet, fill it in from
   * the reader's @encoding; when that is unset too, default both the
   * document and @encoding to UTF-8. */
  c_document = xmlTextReaderCurrentDoc(reader);
  if (c_document && c_document->encoding == NULL) {
    VALUE constructor_encoding = rb_iv_get(self, "@encoding");
    if (RTEST(constructor_encoding)) {
      c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding));
    } else {
      rb_iv_set(self, "@encoding", NOKOGIRI_STR_NEW2("UTF-8"));
      c_document->encoding = xmlStrdup(BAD_CAST "UTF-8");
    }
  }

  if (ret == 1) { return self; }
  if (ret == 0) { return Qnil; }

  /* Any other result is treated as a failure. */
  error = xmlGetLastError();
  if (error) {
    rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
  } else {
    rb_raise(rb_eRuntimeError, "Error pulling: %d", ret);
  }

  /* Presumably not reached, since both branches above raise. */
  return Qnil;
}
self_closing?()
Alias for: empty_element?
state click to toggle source

Get the state of the reader

/*
 * Current read state of the reader, as the integer reported by
 * xmlTextReaderReadState.
 */
static VALUE
state(VALUE self)
{
  xmlTextReaderPtr c_reader;
  int c_state;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_state = xmlTextReaderReadState(c_reader);

  return INT2NUM(c_state);
}
value click to toggle source

Get the text value of the node if present. Returns a utf-8 encoded string.

/*
 * Text value of the current node, if it has one.
 *
 * Returns a Ruby string, or nil when libxml2 reports no value.
 */
static VALUE
value(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_value;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_value = (const char *)xmlTextReaderConstValue(c_reader);
  if (c_value != NULL) {
    return NOKOGIRI_STR_NEW2(c_value);
  }

  return Qnil;
}
value? click to toggle source

Does this node have a text value?

/*
 * Does the current node have a text value?
 *
 * Maps xmlTextReaderHasValue results: 0 -> false, 1 -> true, anything
 * else -> nil.
 */
static VALUE
value_eh(VALUE self)
{
  xmlTextReaderPtr c_reader;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  switch (xmlTextReaderHasValue(c_reader)) {
    case 0:
      return Qfalse;
    case 1:
      return Qtrue;
    default:
      return Qnil;
  }
}
xml_version click to toggle source

Get the XML version of the document being read

/*
 * XML version of the document being read.
 *
 * Returns a Ruby string, or nil when libxml2 reports none.
 */
static VALUE
xml_version(VALUE self)
{
  xmlTextReaderPtr c_reader;
  const char *c_version;

  TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, c_reader);

  c_version = (const char *)xmlTextReaderConstXmlVersion(c_reader);
  if (c_version != NULL) {
    return NOKOGIRI_STR_NEW2(c_version);
  }

  return Qnil;
}