class Nokogiri::XML::Document
Nokogiri::XML::Document
is the main entry point for dealing with XML documents. The Document
is created by parsing XML content from a String or an IO object. See Nokogiri::XML::Document.parse
for more information on parsing.
Document
inherits a great deal of functionality from its superclass Nokogiri::XML::Node
, so please read that class’s documentation as well.
Constants
- NCNAME_CHAR
- NCNAME_RE
- NCNAME_START_CHAR
-
See www.w3.org/TR/REC-xml-names/#ns-decl for more details. Note that we’re not attempting to handle unicode characters partly because libxml2 doesn’t handle unicode characters in NCNAMEs.
Attributes
The errors found while parsing a document.
- Returns
-
Array<Nokogiri::XML::SyntaxError>
When `true`, reparented elements without a namespace will inherit their new parent’s namespace (if one exists). Defaults to `false`.
- Returns
-
Boolean
Example: Default behavior of namespace inheritance
xml = <<~EOF <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> <foo:parent> </foo:parent> </root> EOF doc = Nokogiri::XML(xml) parent = doc.at_xpath("//foo:parent", "foo" => "http://nokogiri.org/default_ns/test/foo") parent.add_child("<child></child>") doc.to_xml # => <?xml version="1.0"?> # <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> # <foo:parent> # <child/> # </foo:parent> # </root>
Example: Setting namespace inheritance to `true`
xml = <<~EOF <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> <foo:parent> </foo:parent> </root> EOF doc = Nokogiri::XML(xml) doc.namespace_inheritance = true parent = doc.at_xpath("//foo:parent", "foo" => "http://nokogiri.org/default_ns/test/foo") parent.add_child("<child></child>") doc.to_xml # => <?xml version="1.0"?> # <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> # <foo:parent> # <foo:child/> # </foo:parent> # </root>
Since v1.12.4
Public Class Methods
Source
/*
 * Document.new(version = "1.0")
 *
 * Allocate an empty libxml2 document that declares the requested XML version
 * and wrap it in a Ruby object of class +klass+.
 *
 * All arguments are optional; only the first one (the version string) is
 * inspected here. The untouched argc/argv are handed on to
 * noko_xml_document_wrap_with_init_args().
 */
static VALUE
new (int argc, VALUE *argv, VALUE klass)
{
  VALUE rb_args;
  VALUE rb_version;
  xmlDocPtr c_document;

  /* "0*": zero required arguments, everything else is collected into rb_args. */
  rb_scan_args(argc, argv, "0*", &rb_args);

  /* A missing (nil) first argument falls back to XML version "1.0". */
  rb_version = rb_ary_entry(rb_args, (long)0);
  if (NIL_P(rb_version)) {
    rb_version = rb_str_new2("1.0");
  }

  c_document = xmlNewDoc((xmlChar *)StringValueCStr(rb_version));

  return noko_xml_document_wrap_with_init_args(klass, c_document, argc, argv);
}
Create a new empty document declaring XML version `version` (defaults to "1.0" when no version is given).
Source
# File lib/nokogiri/xml/document.rb, line 56
#
# Parse XML input from a String, an IO-like object (anything responding to
# +read+), or a Pathname, and return the resulting document.
#
# [Parameters]
# - +string_or_io+ the content to be parsed
# - +url:+ base URI of the document; when not given it is taken from
#   +string_or_io.path+ if the input responds to +path+
# - +encoding:+ name of the encoding to use
# - +options:+ a Nokogiri::XML::ParseOptions or an Integer bit mask
#   (default: XML::ParseOptions::DEFAULT_XML)
#
# The positional +url_+, +encoding_+ and +options_+ parameters only provide
# the defaults for the keyword arguments of the same name.
#
# [Yields] the ParseOptions object, so it can be adjusted before parsing.
#
# [Raises] Nokogiri::XML::SyntaxError ("Empty document") when the input is
# empty and the options are strict.
def parse(
  string_or_io,
  url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_XML,
  url: url_, encoding: encoding_, options: options_
)
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  yield options if block_given?

  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil

  # Empty input never reaches the native readers (read_memory pukes on empty docs).
  if empty_doc?(string_or_io)
    raise Nokogiri::XML::SyntaxError, "Empty document" if options.strict?

    return encoding ? new.tap { |i| i.encoding = encoding } : new
  end

  # Only a File that this method opened itself is tracked here; IO objects
  # supplied by the caller are never closed on their behalf.
  opened_file = nil
  doc = begin
    if string_or_io.respond_to?(:read)
      # TODO: should we instead check for respond_to?(:to_path) ?
      if string_or_io.is_a?(Pathname)
        # resolve the Pathname to the file and open it as an IO object, see #2110
        opened_file = string_or_io.expand_path.open
        string_or_io = opened_file
        url ||= string_or_io.path
      end

      read_io(string_or_io, url, encoding, options.to_i)
    else
      read_memory(string_or_io, url, encoding, options.to_i)
    end
  ensure
    # Previously the handle opened for a Pathname was left for the garbage
    # collector; release it as soon as reading finishes or fails.
    opened_file&.close
  end

  # do xinclude processing
  doc.do_xinclude(options) if options.xinclude?

  doc
end
Parse XML input from a String or IO object, and return a new XML::Document
.
🛡 By default, Nokogiri
treats documents as untrusted, and so does not attempt to load DTDs or access the network. See Nokogiri::XML::ParseOptions
for a complete list of options; and that module’s DEFAULT_XML constant for what’s set (and not set) by default.
- Required Parameters
-
input
(String | IO) The content to be parsed.
- Optional Keyword Arguments
-
url:
(String) The base URI for this document. -
encoding:
(String) The name of the encoding that should be used when processing the document. When not provided, the encoding will be determined based on the document content. -
options:
(Nokogiri::XML::ParseOptions
) Configuration object that determines some behaviors during parsing. See ParseOptions
for more information. The default value is ParseOptions::DEFAULT_XML
.
- Yields
-
If a block is given, a
Nokogiri::XML::ParseOptions
object is yielded to the block which can be configured before parsing. See Nokogiri::XML::ParseOptions
for more information. - Returns
Nokogiri::XML::Document
Source
/*
 * Document.read_io(io, url, encoding, options)
 *
 * Parse a document by pulling data from +rb_io+ through the noko_io_read
 * callback and wrap the result in an instance of +rb_class+.
 *
 * rb_url and rb_encoding may be nil (passed to libxml2 as NULL); rb_options
 * is the Integer parse-option mask.
 *
 * Structured errors reported while parsing are collected into an Array that
 * is stored on the returned document as @errors. When no document is
 * produced, the exception built by Nokogiri::XML::SyntaxError.aggregate is
 * raised, or a RuntimeError if that returns a falsy value.
 */
static VALUE
noko_xml_document_s_read_io(VALUE rb_class, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
{
  /* TODO: deprecate this method, parse should be the preferred entry point. then we can make this
     private. */
  libxmlStructuredErrorHandlerState handler_state;
  VALUE rb_errors = rb_ary_new();

  /*
   * Convert every argument BEFORE the error handler is installed.
   * StringValueCStr and NUM2INT raise (non-local exit) on bad input; doing
   * that while the handler is installed would skip the restore call below and
   * leave libxml2 pointing at rb_errors after this frame is gone.
   */
  const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
  const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
  int c_options = (int)NUM2INT(rb_options);

  noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

  xmlDocPtr c_document = xmlReadIO(
                           (xmlInputReadCallback)noko_io_read,
                           (xmlInputCloseCallback)noko_io_close,
                           (void *)rb_io,
                           c_url,
                           c_enc,
                           c_options
                         );

  noko__structured_error_func_restore(&handler_state);

  if (c_document == NULL) {
    /* Nothing to free here: the pointer is NULL. */
    VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
    if (RB_TEST(exception)) {
      rb_exc_raise(exception);
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }
  }

  VALUE rb_document = noko_xml_document_wrap(rb_class, c_document);
  rb_iv_set(rb_document, "@errors", rb_errors);
  return rb_document;
}
Create a new document from an IO object
Source
/*
 * Document.read_memory(string, url, encoding, options)
 *
 * Parse a document from the bytes of +rb_input+ and wrap the result in an
 * instance of +rb_class+.
 *
 * rb_url and rb_encoding may be nil (passed to libxml2 as NULL); rb_options
 * is the Integer parse-option mask.
 *
 * Structured errors reported while parsing are collected into an Array that
 * is stored on the returned document as @errors. When no document is
 * produced, the exception built by Nokogiri::XML::SyntaxError.aggregate is
 * raised, or a RuntimeError if that returns a falsy value.
 */
static VALUE
noko_xml_document_s_read_memory(VALUE rb_class, VALUE rb_input, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
{
  /* TODO: deprecate this method, parse should be the preferred entry point. then we can make this
     private. */
  libxmlStructuredErrorHandlerState handler_state;
  VALUE rb_errors = rb_ary_new();

  /*
   * Convert every argument BEFORE the error handler is installed: these
   * conversions raise on bad input, which would otherwise leave the handler
   * installed with rb_errors as its context.
   */
  const char *c_buffer = StringValuePtr(rb_input);
  const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
  const char *c_enc = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
  /* NOTE(review): inputs longer than INT_MAX bytes are truncated by this cast -- confirm whether a guard is wanted. */
  int c_buffer_len = (int)RSTRING_LEN(rb_input);
  int c_options = (int)NUM2INT(rb_options);

  /*
   * Save/restore the handler (as read_io and create_entity do) rather than
   * resetting it to NULL, so a handler that was active before this call is
   * not discarded.
   */
  noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

  xmlDocPtr c_document = xmlReadMemory(c_buffer, c_buffer_len, c_url, c_enc, c_options);

  noko__structured_error_func_restore(&handler_state);

  if (c_document == NULL) {
    VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
    if (RB_TEST(exception)) {
      rb_exc_raise(exception);
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }
  }

  VALUE document = noko_xml_document_wrap(rb_class, c_document);
  rb_iv_set(document, "@errors", rb_errors);
  return document;
}
Create a new document from a String
Source
# File lib/nokogiri/xml/document.rb, line 104
⚠ This method is only available when running JRuby.
Create a Document
using an existing Java DOM document object.
The returned Document
shares the same underlying data structure as the Java object, so changes in one are reflected in the other.
- Parameters
-
`java_document` (Java::OrgW3cDom::Document) (The class `Java::OrgW3cDom::Document` is also accessible as `org.w3c.dom.Document`.)
- Returns
See also #to_java
Public Instance Methods
Source
# File lib/nokogiri/xml/document.rb, line 437
#
# Add +node_or_tags+ as a child of this document. See
# Nokogiri::XML::Node#add_child.
#
# A document may hold only one root node. When a root already exists (other
# than the "nokogiri_text_wrapper" placeholder), only comment and
# processing-instruction nodes may be added.
#
# [Raises] RuntimeError ("A document may not have multiple root nodes.") when
# the addition would create a second root, or when the coerced argument is a
# NodeSet with more than one node.
def add_child(node_or_tags)
  if root && root.name != "nokogiri_text_wrapper"
    # The argument has not been coerced yet, so it may be a String of markup
    # or a NodeSet. Neither responds to #comment?, and asking them used to
    # surface as a NoMethodError instead of the intended error below.
    addable_beside_root = node_or_tags.respond_to?(:comment?) &&
      (node_or_tags.comment? || node_or_tags.processing_instruction?)

    raise "A document may not have multiple root nodes." unless addable_beside_root
  end

  node_or_tags = coerce(node_or_tags)

  if node_or_tags.is_a?(XML::NodeSet)
    raise "A document may not have multiple root nodes." if node_or_tags.size > 1

    super(node_or_tags.first)
  else
    # Bare super forwards the (now coerced) value of node_or_tags.
    super
  end
end
Nokogiri::XML::Node#add_child
Source
/*
 * Canonicalize the document with xmlC14NExecute and return the output as the
 * contents of a freshly created StringIO.
 *
 * Up to three optional arguments are accepted (scan spec "03"):
 *   rb_mode       - Integer (T_FIXNUM) mode handed to xmlC14NExecute; 0 when omitted
 *   rb_namespaces - Array whose entries are handed over as a NULL-terminated
 *                   list of C strings
 *   rb_comments_p - its truthiness is handed over as an int flag
 *
 * When a block is given it is passed as the callback data, with block_caller
 * as the visibility callback.
 *
 * Raises RuntimeError when rb_namespaces is given together with mode
 * XML_C14N_1_0 or XML_C14N_1_1.
 */
static VALUE
rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
{
  VALUE rb_mode;
  VALUE rb_namespaces;
  VALUE rb_comments_p;
  int c_mode = 0;
  xmlChar **c_namespaces;

  xmlDocPtr c_doc;
  xmlOutputBufferPtr c_obuf;
  xmlC14NIsVisibleCallback c_callback_wrapper = NULL;
  void *rb_callback = NULL;

  VALUE rb_cStringIO;
  VALUE rb_io;

  rb_scan_args(argc, argv, "03", &rb_mode, &rb_namespaces, &rb_comments_p);
  if (!NIL_P(rb_mode)) {
    Check_Type(rb_mode, T_FIXNUM);
    c_mode = NUM2INT(rb_mode);
  }
  if (!NIL_P(rb_namespaces)) {
    Check_Type(rb_namespaces, T_ARRAY);
    /* A namespace list is rejected for these two modes. */
    if (c_mode == XML_C14N_1_0 || c_mode == XML_C14N_1_1) {
      rb_raise(rb_eRuntimeError, "This canonicalizer does not support this operation");
    }
  }

  c_doc = noko_xml_document_unwrap(self);

  /* The output is written into a new StringIO instance via noko_io_write. */
  rb_cStringIO = rb_const_get_at(rb_cObject, rb_intern("StringIO"));
  rb_io = rb_class_new_instance(0, 0, rb_cStringIO);
  /* NOTE(review): the result of xmlAllocOutputBuffer is dereferenced without a NULL check -- confirm. */
  c_obuf = xmlAllocOutputBuffer(NULL);

  c_obuf->writecallback = (xmlOutputWriteCallback)noko_io_write;
  c_obuf->closecallback = (xmlOutputCloseCallback)noko_io_close;
  c_obuf->context = (void *)rb_io;

  if (rb_block_given_p()) {
    c_callback_wrapper = block_caller;
    rb_callback = (void *)rb_block_proc();
  }

  if (NIL_P(rb_namespaces)) {
    c_namespaces = NULL;
  } else {
    long ns_len = RARRAY_LEN(rb_namespaces);
    /* One extra zeroed slot keeps the list NULL-terminated. */
    c_namespaces = ruby_xcalloc((size_t)ns_len + 1, sizeof(xmlChar *));
    for (int j = 0 ; j < ns_len ; j++) {
      VALUE entry = rb_ary_entry(rb_namespaces, j);
      /* NOTE(review): if StringValueCStr raises here, c_namespaces and c_obuf
       * do not appear to be released -- confirm. */
      c_namespaces[j] = (xmlChar *)StringValueCStr(entry);
    }
  }

  /* NOTE(review): the return value of xmlC14NExecute is not inspected. */
  xmlC14NExecute(c_doc, c_callback_wrapper, rb_callback,
                 c_mode,
                 c_namespaces,
                 (int)RTEST(rb_comments_p),
                 c_obuf);

  ruby_xfree(c_namespaces);
  xmlOutputBufferClose(c_obuf);

  return rb_funcall(rb_io, rb_intern("string"), 0);
}
Canonicalize a document and return the results. Takes an optional block that takes two parameters: the obj
and that node’s parent
. The obj
will be either a Nokogiri::XML::Node
, or a Nokogiri::XML::Namespace.
The block must return a non-nil, non-false value if the obj
passed in should be included in the canonicalized document.
Source
# File lib/nokogiri/xml/document.rb, line 223
#
# Clone this document.
#
# [Parameters]
# - +level+ (optional Integer) forwarded to +initialize_copy_with_args+;
#   0 is a shallow copy, 1 (the default) is a deep copy.
#
# [Returns] whatever +initialize_copy_with_args+ returns for the clone.
def clone(level = 1)
  # Bind the saved clone implementation to this instance, then let the new
  # object copy the document contents from self.
  OBJECT_CLONE_METHOD
    .bind_call(self)
    .initialize_copy_with_args(self, level)
end
Clone this node.
- Parameters
-
level
(optional Integer). 0 is a shallow copy, 1 (the default) is a deep copy.
- Returns
-
The new
Nokogiri::XML::Document
Source
# File lib/nokogiri/xml/document.rb, line 361
#
# Collect every namespace found by the XPath query "//namespace::*" into a
# Hash.
#
# Keys are "xmlns" for a namespace without a prefix and "xmlns:<prefix>"
# otherwise; values are the namespace hrefs. The "xml" prefix is skipped.
# Because the result is a Hash, a prefix that occurs more than once keeps
# only the last href encountered.
def collect_namespaces
  collected = {}

  xpath("//namespace::*").each do |ns|
    prefix = ns.prefix
    next if prefix == "xml"

    key = prefix.nil? ? "xmlns" : "xmlns:#{prefix}"
    collected[key] = ns.href
  end

  collected
end
Recursively get all namespaces from this node and its subtree and return them as a hash.
⚠ This method will not handle duplicate namespace prefixes, since the return value is a hash.
Note that this method does an xpath lookup for nodes with namespaces, and as a result the order (and which duplicate prefix “wins”) may be dependent on the implementation of the underlying XML
library.
Example: Basic usage
Given this document:
<root xmlns="default" xmlns:foo="bar"> <bar xmlns:hello="world" /> </root>
This method will return:
{"xmlns:foo"=>"bar", "xmlns"=>"default", "xmlns:hello"=>"world"}
Example: Duplicate prefixes
Given this document:
<root xmlns:foo="bar"> <bar xmlns:foo="baz" /> </root>
The hash returned will be something like:
{"xmlns:foo" => "baz"}
Source
Source
Source
# File lib/nokogiri/xml/document.rb, line 276
#
# Create a new Element named +name+ that belongs to this document.
#
# Each extra argument initializes the element:
# - a Hash sets attributes; a key (after +to_s+) that matches NCNAME_RE adds a
#   namespace definition instead, using the regexp's first capture as the
#   prefix
# - anything else is assigned as the element's content
#
# If the element ends up with a namespace definition whose prefix is nil or
# empty, that definition becomes the element's namespace.
#
# The block, if any, is forwarded to Nokogiri::XML::Element.new.
#
# [Returns] the new element
def create_element(name, *contents_or_attrs, &block)
  element = Nokogiri::XML::Element.new(name, self, &block)

  contents_or_attrs.each do |arg|
    unless Hash === arg
      element.content = arg
      next
    end

    arg.each do |attr_name, attr_value|
      attr_key = attr_name.to_s
      ns_match = NCNAME_RE.match(attr_key)

      if ns_match
        element.add_namespace_definition(ns_match[1], attr_value)
      else
        element[attr_key] = attr_value.to_s
      end
    end
  end

  default_ns = element.namespace_definitions.find do |definition|
    definition.prefix.nil? || (definition.prefix == "")
  end
  element.namespace = default_ns if default_ns

  element
end
Create a new Element
with `name` belonging to this document, optionally setting contents or attributes.
This method is not the most user-friendly option if your intention is to add a node to the document tree. Prefer one of the Nokogiri::XML::Node
methods like Node#add_child
, Node#add_next_sibling
, Node#replace
, etc. which will both create an element (or subtree) and place it in the document tree.
Arguments may be passed to initialize the element:
-
a Hash argument will be used to set attributes
-
a non-Hash object that responds to #to_s will be used to set the new node’s contents
A block may be passed to mutate the node.
- Parameters
-
`name` (String)
-
`contents_or_attrs` (#to_s, Hash)
- Yields
-
`node` (
Nokogiri::XML::Element
) - Returns
Example: An empty element without attributes
doc.create_element("div") # => <div></div>
Example: An element with contents
doc.create_element("div", "contents") # => <div>contents</div>
Example: An element with attributes
doc.create_element("div", {"class" => "container"}) # => <div class='container'></div>
Example: An element with contents and attributes
doc.create_element("div", "contents", {"class" => "container"}) # => <div class='container'>contents</div>
Example: Passing a block to mutate the element
doc.create_element("div") { |node| node["class"] = "blue" if before_noon? }
Source
/*
 * Document#create_entity(name, type = nil, external_id = nil, system_id = nil, content = nil)
 *
 * Add an entity declaration to the document with xmlAddDocEntity and return
 * it wrapped as a Nokogiri::XML::EntityDecl.
 *
 * A nil type falls back to XML_INTERNAL_GENERAL_ENTITY; nil strings are
 * passed to libxml2 as NULL.
 *
 * Structured errors reported during the call are collected. If no entity is
 * created, the exception built by Nokogiri::XML::SyntaxError.aggregate is
 * raised, or a RuntimeError if that returns a falsy value.
 */
static VALUE
noko_xml_document__create_entity(int argc, VALUE *argv, VALUE rb_document)
{
  VALUE rb_name;
  VALUE rb_type;
  VALUE rb_ext_id;
  VALUE rb_sys_id;
  VALUE rb_content;

  /* "14": one required argument followed by up to four optional ones. */
  rb_scan_args(argc, argv, "14",
               &rb_name,
               &rb_type, &rb_ext_id, &rb_sys_id, &rb_content);

  xmlDocPtr c_document = noko_xml_document_unwrap(rb_document);

  /*
   * Convert every argument BEFORE the error handler is installed.
   * StringValueCStr and NUM2INT raise on bad input; raising while the handler
   * is installed would skip the restore call below.
   */
  const xmlChar *c_name = (const xmlChar *)(NIL_P(rb_name) ? NULL : StringValueCStr(rb_name));
  int c_type = (int)(NIL_P(rb_type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(rb_type));
  const xmlChar *c_ext_id = (const xmlChar *)(NIL_P(rb_ext_id) ? NULL : StringValueCStr(rb_ext_id));
  const xmlChar *c_sys_id = (const xmlChar *)(NIL_P(rb_sys_id) ? NULL : StringValueCStr(rb_sys_id));
  const xmlChar *c_content = (const xmlChar *)(NIL_P(rb_content) ? NULL : StringValueCStr(rb_content));

  libxmlStructuredErrorHandlerState handler_state;
  VALUE rb_errors = rb_ary_new();
  noko__structured_error_func_save_and_set(&handler_state, (void *)rb_errors, noko__error_array_pusher);

  xmlEntityPtr c_entity = xmlAddDocEntity(c_document, c_name, c_type, c_ext_id, c_sys_id, c_content);

  noko__structured_error_func_restore(&handler_state);

  if (NULL == c_entity) {
    VALUE exception = rb_funcall(cNokogiriXmlSyntaxError, rb_intern("aggregate"), 1, rb_errors);
    if (RB_TEST(exception)) {
      rb_exc_raise(exception);
    } else {
      rb_raise(rb_eRuntimeError, "Could not create entity");
    }
  }

  return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)c_entity);
}
Create a new entity named name
.
type
is an integer representing the type of entity to be created, and it defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL
. See the constants on Nokogiri::XML::EntityDecl
for more information.
external_id
, system_id
, and content
set the External ID, System ID, and content respectively. All of these parameters are optional.
Source
Source
# File lib/nokogiri/xml/document.rb, line 501
#
# Describe the document as a Hash for use in pattern matching.
#
# The +keys+ argument is accepted but not consulted: the Hash always contains
# the single key +:root+, mapped to the value of #root.
def deconstruct_keys(keys)
  description = {}
  description[:root] = root
  description
end
Returns a hash describing the Document
, to use in pattern matching.
Valid keys and their values:
- `root` → (Node, nil) The root node of the Document, or `nil` if the document is empty.
In the future, other keys may allow accessing things like doctype and processing instructions. If you have a use case and would like this functionality, please let us know by opening an issue or a discussion on the github project.
Example
doc = Nokogiri::XML.parse(<<~XML) <?xml version="1.0"?> <root> <child> </root> XML doc.deconstruct_keys([:root]) # => {:root=> # #(Element:0x35c { # name = "root", # children = [ # #(Text "\n" + " "), # #(Element:0x370 { name = "child", children = [ #(Text "\n")] }), # #(Text "\n")] # })}
Example of an empty document
doc = Nokogiri::XML::Document.new doc.deconstruct_keys([:root]) # => {:root=>nil}
Since v1.14.0
Source
# File lib/nokogiri/xml/document.rb, line 409
#
# Apply the registered decorators to +node+.
#
# For every class in the decorator registry that +node+ is an instance of,
# +node+ is extended with each module registered for that class. Does nothing
# (and returns nil) when no decorators have been registered.
def decorate(node)
  return nil if !@decorators

  @decorators.each do |klass, modules|
    modules.each { |decorator| node.extend(decorator) } if node.is_a?(klass)
  end
end
Apply any decorators to node
Source
# File lib/nokogiri/xml/document.rb, line 368
#
# Return the list of decorators registered under +key+.
#
# Both the registry and the per-key list are created on first use, so the
# returned Array can be appended to directly.
def decorators(key)
  registry = (@decorators ||= {})
  registry[key] ||= []
end
Get the list of decorators given key
Source
# File lib/nokogiri/xml/document.rb, line 321
#
# A document is its own document: returns the receiver.
def document
  return self
end
A reference to self
Source
# File lib/nokogiri/xml/document.rb, line 207
#
# Duplicate this document.
#
# [Parameters]
# - +level+ (optional Integer) forwarded to +initialize_copy_with_args+;
#   0 is a shallow copy, 1 (the default) is a deep copy.
#
# [Returns] whatever +initialize_copy_with_args+ returns for the duplicate.
def dup(level = 1)
  # Bind the saved dup implementation to this instance, then let the new
  # object copy the document contents from self.
  OBJECT_DUP_METHOD
    .bind_call(self)
    .initialize_copy_with_args(self, level)
end
Duplicate this node.
- Parameters
-
level
(optional Integer). 0 is a shallow copy, 1 (the default) is a deep copy.
- Returns
-
The new
Nokogiri::XML::Document
Source
/*
 * Document#encoding
 *
 * Return the document's encoding as a Ruby String, or nil when the
 * underlying document has no encoding set.
 */
static VALUE
encoding(VALUE self)
{
  xmlDocPtr c_document = noko_xml_document_unwrap(self);

  if (c_document->encoding) {
    return NOKOGIRI_STR_NEW2(c_document->encoding);
  }

  return Qnil;
}
Get the encoding for this Document
Source
/*
 * Document#encoding=(encoding)
 *
 * Replace the document's encoding string with a copy of +encoding+ and
 * return +encoding+.
 *
 * Raises whatever StringValueCStr raises when +encoding+ cannot be used as a
 * C string (for example nil); in that case the document is left unchanged.
 */
static VALUE
set_encoding(VALUE self, VALUE encoding)
{
  xmlDocPtr doc = noko_xml_document_unwrap(self);

  /*
   * Convert and copy BEFORE releasing the old value. StringValueCStr can
   * raise; freeing first left doc->encoding pointing at released memory
   * whenever the conversion failed.
   */
  xmlChar *c_new_encoding = xmlStrdup((xmlChar *)StringValueCStr(encoding));

  if (doc->encoding) {
    xmlFree(DISCARD_CONST_QUAL_XMLCHAR(doc->encoding));
  }

  doc->encoding = c_new_encoding;

  return encoding;
}
Set the encoding string for this Document
Source
# File lib/nokogiri/xml/document.rb, line 429
#
# Create a DocumentFragment from +tags+ (nil by default).
#
# This document and its current root are passed to DocumentFragment.new
# alongside +tags+.
def fragment(tags = nil)
  context_node = root
  DocumentFragment.new(self, tags, context_node)
end
Create a Nokogiri::XML::DocumentFragment
from tags
Returns an empty fragment if tags
is nil.
Source
# File lib/nokogiri/xml/document.rb, line 316
#
# The name of this document, which is always the String "document".
def name
  return "document"
end
The name of this document. Always returns “document”
Source
# File lib/nokogiri/xml/document.rb, line 422
#
# Return the namespaces of the root node, or an empty Hash when the document
# has no root.
def namespaces
  return {} unless root

  root.namespaces
end
Get the hash of namespaces on the root Nokogiri::XML::Node
Source
/*
 * Document#remove_namespaces!
 *
 * Hand the document node itself to recursively_remove_namespaces_from_node()
 * and return the receiver.
 */
static VALUE
remove_namespaces_bang(VALUE self)
{
  xmlDocPtr c_document = noko_xml_document_unwrap(self);
  xmlNodePtr c_start = (xmlNodePtr)c_document;

  recursively_remove_namespaces_from_node(c_start);

  return self;
}
Remove all namespaces from all nodes in the document.
This could be useful for developers who either don’t understand namespaces or don’t care about them.
The following example shows a use case, and you can decide for yourself whether this is a good thing or not:
doc = Nokogiri::XML <<-EOXML <root> <car xmlns:part="http://general-motors.com/"> <part:tire>Michelin Model XGV</part:tire> </car> <bicycle xmlns:part="http://schwinn.com/"> <part:tire>I'm a bicycle tire!</part:tire> </bicycle> </root> EOXML doc.xpath("//tire").to_s # => "" doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>" doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>" doc.remove_namespaces! doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>" doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "" doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""
For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml.html
Source
/*
 * Document#root
 *
 * Return the wrapped root element of the document, or nil when the document
 * has no root element.
 */
static VALUE
rb_xml_document_root(VALUE self)
{
  xmlDocPtr c_doc = noko_xml_document_unwrap(self);
  xmlNodePtr c_root_element = xmlDocGetRootElement(c_doc);

  if (c_root_element == NULL) {
    return Qnil;
  }

  return noko_xml_node_wrap(Qnil, c_root_element);
}
Get the root node for this document.
Source
/*
 * Document#root=(new_root)
 *
 * Replace the document's root element with +rb_new_root+ and return
 * +rb_new_root+. Passing nil removes the root element.
 *
 * Raises ArgumentError when +rb_new_root+ is neither nil nor a
 * Nokogiri::XML::Node, and RuntimeError when a node from another document
 * cannot be copied into this one. In both cases the existing root is left
 * in place.
 */
static VALUE
rb_xml_document_root_set(VALUE self, VALUE rb_new_root)
{
  xmlDocPtr c_document;
  xmlNodePtr c_new_root = NULL, c_current_root;

  c_document = noko_xml_document_unwrap(self);

  /*
   * Validate and prepare the new root BEFORE touching the current one.
   * Previously the current root was unlinked first, so a rejected argument
   * still left the document without a root.
   */
  if (!NIL_P(rb_new_root)) {
    if (!rb_obj_is_kind_of(rb_new_root, cNokogiriXmlNode)) {
      rb_raise(rb_eArgError,
               "expected Nokogiri::XML::Node but received %"PRIsVALUE,
               rb_obj_class(rb_new_root));
    }

    Noko_Node_Get_Struct(rb_new_root, xmlNode, c_new_root);

    /* If the new root's document is not the same as the current document,
     * then we need to dup the node in to this document. */
    if (c_new_root->doc != c_document) {
      c_new_root = xmlDocCopyNode(c_new_root, c_document, 1);
      if (!c_new_root) {
        rb_raise(rb_eRuntimeError, "Could not reparent node (xmlDocCopyNode)");
      }
    }
  }

  /* Nothing below raises: detach the old root and pin it on the document. */
  c_current_root = xmlDocGetRootElement(c_document);
  if (c_current_root) {
    xmlUnlinkNode(c_current_root);
    noko_xml_document_pin_node(c_current_root);
  }

  xmlDocSetRootElement(c_document, c_new_root);

  return rb_new_root;
}
Set the root element on this document
Source
# File lib/nokogiri/xml/document.rb, line 398
#
# Register Nokogiri::Decorators::Slop as a decorator for XML::Node (unless it
# is already registered) and call +decorate!+. Returns the document.
def slop!
  node_decorators = decorators(XML::Node)

  unless node_decorators.include?(Nokogiri::Decorators::Slop)
    node_decorators << Nokogiri::Decorators::Slop
    decorate!
  end

  self
end
Explore a document with shortcut methods. See Nokogiri::Slop
for details.
Note that any nodes that have been instantiated before #slop!
is called will not be decorated with sloppy behavior. So, if you’re in irb, the preferred idiom is:
irb> doc = Nokogiri::Slop my_markup
and not
irb> doc = Nokogiri::HTML my_markup ... followed by irb's implicit inspect (and therefore instantiation of every node) ... irb> doc.slop! ... which does absolutely nothing.
Source
# File lib/nokogiri/xml/document.rb, line 122
⚠ This method is only available when running JRuby.
Returns the underlying Java DOM document object for this document.
The returned Java object shares the same underlying data structure as this document, so changes in one are reflected in the other.
- Returns
-
Java::OrgW3cDom::Document (The class `Java::OrgW3cDom::Document` is also accessible as `org.w3c.dom.Document`.)
See also Document.wrap
Source
/*
 * Document#url
 *
 * Return the document's URL as a Ruby String, or nil when the underlying
 * document has no URL set.
 */
static VALUE
url(VALUE self)
{
  xmlDocPtr c_document = noko_xml_document_unwrap(self);

  if (!c_document->URL) {
    return Qnil;
  }

  return NOKOGIRI_STR_NEW2(c_document->URL);
}
Get the url name for this document.
Source
Source
Source
# File lib/nokogiri/xml/document.rb, line 457
#
# The document type that determines CSS-to-XPath translation. For this class
# it is always the XML doctype configuration constant.
def xpath_doctype
  return Nokogiri::CSS::XPathVisitor::DoctypeConfig::XML
end
- Returns
-
The document type which determines CSS-to-XPath translation.
See XPathVisitor for more information.