class Nokogiri::HTML4::Document
Public Class Methods
Create a new empty document with base URI uri and external ID external_id.
/*
 * Allocate a new, empty HTML document.
 *
 * Any number of arguments is accepted. The first two, when truthy, are
 * converted to C strings and handed to htmlNewDoc(); otherwise NULL is
 * passed in their place. The complete, unmodified argument list is then
 * forwarded to noko_xml_document_wrap_with_init_args() together with klass.
 */
static VALUE
rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
{
  VALUE rb_args;
  VALUE rb_uri;
  VALUE rb_external_id;
  const xmlChar *c_uri = NULL;
  const xmlChar *c_external_id = NULL;
  htmlDocPtr c_doc;

  /* "0*": no required arguments, everything is collected into rb_args. */
  rb_scan_args(argc, argv, "0*", &rb_args);
  rb_uri = rb_ary_entry(rb_args, (long)0);
  rb_external_id = rb_ary_entry(rb_args, (long)1);

  if (RTEST(rb_uri)) {
    c_uri = (const xmlChar *)StringValueCStr(rb_uri);
  }
  if (RTEST(rb_external_id)) {
    c_external_id = (const xmlChar *)StringValueCStr(rb_external_id);
  }

  c_doc = htmlNewDoc(c_uri, c_external_id);

  return noko_xml_document_wrap_with_init_args(klass, c_doc, argc, argv);
}
Parse HTML4 input from a String or IO object, and return a new HTML4::Document.
- Required Parameters
-
input
(String | IO) The content to be parsed.
- Optional Keyword Arguments
-
url:
(String) The base URI for this document. -
encoding:
(String) The name of the encoding that should be used when processing the document. When not provided, the encoding will be determined based on the document content. -
options:
(Nokogiri::XML::ParseOptions) Configuration object that determines some behaviors during parsing. See ParseOptions for more information. The default value is ParseOptions::DEFAULT_HTML.
- Yields
-
If a block is given, a
Nokogiri::XML::ParseOptions
object is yielded to the block which can be configured before parsing. See Nokogiri::XML::ParseOptions
for more information.
- Returns
-
Nokogiri::HTML4::Document
# File lib/nokogiri/html4/document.rb, line 189
#
# Parse HTML4 +input+ and return the resulting document.
#
# +input+ may be a String, an object responding to +read+ (an IO), or a
# Pathname. +url+, +encoding+ and +options+ may be given positionally or as
# keyword arguments; a keyword argument takes precedence over its positional
# counterpart because the positional value is only used as the keyword's
# default.
#
# If a block is given, the ParseOptions object is yielded before parsing so
# the caller can adjust it.
#
# A Pathname is opened here as an IO object. Because the caller never sees
# that IO, this method is responsible for it and closes it once read_io has
# returned or raised.
def parse(
  input,
  url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
  url: url_, encoding: encoding_, options: options_
)
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  yield options if block_given?

  url ||= input.respond_to?(:path) ? input.path : nil

  # Prefer the encoding carried by the input itself, unless it is binary.
  if input.respond_to?(:encoding)
    unless input.encoding == Encoding::ASCII_8BIT
      encoding ||= input.encoding.name
    end
  end

  if input.respond_to?(:read)
    # Only an IO that this method opens itself is tracked for closing;
    # IO objects supplied by the caller are left untouched.
    owned_io = nil
    begin
      if input.is_a?(Pathname)
        # resolve the Pathname to the file and open it as an IO object, see #2110
        input = owned_io = input.expand_path.open
        url ||= input.path
      end

      unless encoding
        input = EncodingReader.new(input)
        begin
          return read_io(input, url, encoding, options.to_i)
        rescue EncodingReader::EncodingFound => e
          # The first pass discovered the encoding; parse again with it,
          # reusing the same EncodingReader.
          encoding = e.found_encoding
        end
      end
      return read_io(input, url, encoding, options.to_i)
    ensure
      owned_io.close if owned_io
    end
  end

  # read_memory pukes on empty docs
  if input.nil? || input.empty?
    return encoding ? new.tap { |i| i.encoding = encoding } : new
  end

  encoding ||= EncodingReader.detect_encoding(input)

  read_memory(input, url, encoding, options.to_i)
end
Read the HTML document from io with given url, encoding, and options. See Nokogiri::HTML4.parse
/*
 * Parse an HTML document by pulling data from rb_io through the
 * noko_io_read / noko_io_close callbacks.
 *
 * rb_url and rb_encoding may be nil, in which case NULL is passed to
 * htmlReadIO(). rb_options is converted to a C int.
 *
 * Errors reported through libxml2's structured error handler while parsing
 * are collected into a Ruby array. On success the array is stored on the
 * returned document as @errors.
 *
 * Raises:
 *  - whatever rb_io returns from its encoding_found method, when non-nil
 *  - RuntimeError when parsing failed and no error was collected
 *  - Nokogiri::XML::SyntaxError built from the first collected error when
 *    no document was produced, or when XML_PARSE_RECOVER is not set and at
 *    least one error was collected
 */
static VALUE
rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
{
  VALUE errors = rb_ary_new();
  VALUE first_error;
  VALUE message;
  const char *url = NULL;
  const char *encoding = NULL;
  int parse_options;
  int strict_failure;
  htmlDocPtr parsed;

  if (!NIL_P(rb_url)) {
    url = StringValueCStr(rb_url);
  }
  if (!NIL_P(rb_encoding)) {
    encoding = StringValueCStr(rb_encoding);
  }
  parse_options = NUM2INT(rb_options);

  /* Collect parser errors into `errors` only for the duration of the parse. */
  xmlSetStructuredErrorFunc((void *)errors, noko__error_array_pusher);
  parsed = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, url, encoding, parse_options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  /*
   * If EncodingFound has occurred in EncodingReader, make sure to do
   * a cleanup and propagate the error.
   */
  if (rb_respond_to(rb_io, id_encoding_found)) {
    VALUE pending = rb_funcall(rb_io, id_encoding_found, 0);
    if (!NIL_P(pending)) {
      xmlFreeDoc(parsed);
      rb_exc_raise(pending);
    }
  }

  /* Without the recover option, any collected error counts as a failure. */
  strict_failure = !(parse_options & XML_PARSE_RECOVER) && (RARRAY_LEN(errors) > 0);

  if ((parsed != NULL) && !strict_failure) {
    VALUE rb_doc = noko_xml_document_wrap(klass, parsed);
    rb_iv_set(rb_doc, "@errors", errors);
    return rb_doc;
  }

  /* Failure path: release whatever was built, then raise. */
  xmlFreeDoc(parsed);

  first_error = rb_ary_entry(errors, 0);
  if (first_error == Qnil) {
    rb_raise(rb_eRuntimeError, "Could not parse document");
  }

  message = rb_funcall(first_error, id_to_s, 0);
  message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "),
                          message);
  rb_exc_raise(rb_class_new_instance(1, &message, cNokogiriXmlSyntaxError));

  return Qnil;
}
Read the HTML document contained in string with given url, encoding, and options. See Nokogiri::HTML4.parse
/*
 * Parse an HTML document held in the Ruby string rb_html.
 *
 * rb_url and rb_encoding may be nil, in which case NULL is passed to
 * htmlReadMemory(). rb_options is converted to a C int.
 *
 * Errors reported through libxml2's structured error handler while parsing
 * are collected into a Ruby array. On success the array is stored on the
 * returned document as @errors.
 *
 * Raises:
 *  - RangeError when the string length does not fit in a C int
 *  - RuntimeError when parsing failed and no error was collected
 *  - Nokogiri::XML::SyntaxError built from the first collected error when
 *    no document was produced, or when XML_PARSE_RECOVER is not set and at
 *    least one error was collected
 */
static VALUE
rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
{
  VALUE rb_doc;
  VALUE rb_error_list = rb_ary_new();
  htmlDocPtr c_doc;
  const char *c_buffer = StringValuePtr(rb_html);
  const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
  const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
  /*
   * htmlReadMemory() takes the buffer size as an int. RSTRING_LENINT raises
   * RangeError for a length that does not fit, where a plain (int) cast
   * would silently truncate it and hand libxml2 a wrong size.
   */
  int html_len = RSTRING_LENINT(rb_html);
  int options = NUM2INT(rb_options);

  /* Collect parser errors into rb_error_list only for the duration of the parse. */
  xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
  c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  /* Without the recover option, any collected error counts as a failure. */
  if ((c_doc == NULL) || (!(options & XML_PARSE_RECOVER) && (RARRAY_LEN(rb_error_list) > 0))) {
    VALUE rb_error;

    xmlFreeDoc(c_doc);

    rb_error = rb_ary_entry(rb_error_list, 0);
    if (rb_error == Qnil) {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    } else {
      VALUE exception_message = rb_funcall(rb_error, id_to_s, 0);
      exception_message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "),
                                        exception_message);
      rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError));
    }

    return Qnil;
  }

  rb_doc = noko_xml_document_wrap(klass, c_doc);
  rb_iv_set(rb_doc, "@errors", rb_error_list);
  return rb_doc;
}
Public Instance Methods
Create a Nokogiri::XML::DocumentFragment from tags
# File lib/nokogiri/html4/document.rb, line 149
#
# Build a DocumentFragment that belongs to this document.
#
# +tags+ is passed through unchanged (nil by default); this document's
# +root+ is supplied as the third constructor argument.
def fragment(tags = nil)
  DocumentFragment.new(
    self,
    tags,
    root,
  )
end
Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.
# File lib/nokogiri/html4/document.rb, line 12
#
# Return the encoding declared by a meta tag, or nil when none is found.
#
# A <meta charset="..."> element is consulted first. Otherwise the charset
# is extracted from the "content" attribute of the element returned by
# meta_content_type.
def meta_encoding
  charset_meta = at_xpath("//meta[@charset]")
  return charset_meta[:charset] if charset_meta

  content_type_meta = meta_content_type
  return nil unless content_type_meta

  # The first capture group is the charset name.
  content_type_meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
end
Set the meta tag encoding for this document.
If a meta encoding tag is already present, its content is replaced with the given text.
Otherwise, this method tries to create one at an appropriate place supplying head and/or html elements as necessary, which is inside a head element if any, and before any text node or content element (typically <body>) if any.
The result when trying to set an encoding that is different from the document encoding is undefined.
Beware that in CRuby, libxml2 automatically inserts a meta tag into a head element.
# File lib/nokogiri/html4/document.rb, line 36
#
# Declare +encoding+ in the document's meta tag. Returns +encoding+.
#
# Lookup order:
# 1. the element returned by meta_content_type: its "content" is rewritten
# 2. an existing <meta charset>: its "charset" is rewritten
# 3. otherwise a new <meta> is created. It uses the "charset" form when the
#    internal subset is an HTML5 DTD, and the http-equiv/content form
#    otherwise. It is prepended to <head> if one exists, or else handed to
#    set_metadata_element.
def meta_encoding=(encoding)
  content_type_meta = meta_content_type
  if content_type_meta
    content_type_meta["content"] = format("text/html; charset=%s", encoding)
    return encoding
  end

  charset_meta = at_xpath("//meta[@charset]")
  if charset_meta
    charset_meta["charset"] = encoding
    return encoding
  end

  new_meta = XML::Node.new("meta", self)
  subset = internal_subset
  if subset && subset.html5_dtd?
    new_meta["charset"] = encoding
  else
    new_meta["http-equiv"] = "Content-Type"
    new_meta["content"] = format("text/html; charset=%s", encoding)
  end

  head_element = at_xpath("//head")
  if head_element
    head_element.prepend_child(new_meta)
  else
    set_metadata_element(new_meta)
  end

  encoding
end
Serialize Node using options. Save options can also be set using a block.
See also Nokogiri::XML::Node::SaveOptions and Node.
These two statements are equivalent:
node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
or
node.serialize(:encoding => 'UTF-8') do |config| config.format.as_xml end
# File lib/nokogiri/html4/document.rb, line 142
#
# Serialize this document, using the HTML save options when the caller did
# not choose any.
#
# If +options+ has no truthy :save_with entry, it is set to
# XML::Node::SaveOptions::DEFAULT_HTML on the passed-in object itself.
# The call is then delegated to the superclass implementation.
def serialize(options = {})
  unless options[:save_with]
    options[:save_with] = XML::Node::SaveOptions::DEFAULT_HTML
  end

  super
end
Get the title string of this document. Return nil if there is no title tag.
# File lib/nokogiri/html4/document.rb, line 70
#
# Return the inner text of the first <title> element found by the XPath
# "//title", or the falsy lookup result when there is none.
def title
  title_node = at_xpath("//title")
  title_node && title_node.inner_text
end
Set the title string of this document.
If a title element is already present, its content is replaced with the given text.
Otherwise, this method tries to create one at an appropriate place supplying head and/or html elements as necessary, which is inside a head element if any, right after a meta encoding/charset tag if any, and before any text node or content element (typically <body>) if any.
# File lib/nokogiri/html4/document.rb, line 85
#
# Set the document title to +text+.
#
# An existing <title> has its children replaced by a new text node and
# +text+ is returned. Otherwise a new <title> element is created and placed,
# in order of preference:
# 1. appended to <head>, if present
# 2. right after a <meta charset> element or the element returned by
#    meta_content_type
# 3. wherever set_metadata_element puts it
def title=(text)
  text_node = XML::Text.new(text, self)

  existing_title = at_xpath("//title")
  if existing_title
    existing_title.children = text_node
    return text
  end

  new_title = XML::Node.new("title", self) << text_node

  head_element = at_xpath("//head")
  return head_element << new_title if head_element

  charset_anchor = at_xpath("//meta[@charset]") || meta_content_type
  if charset_anchor
    # better put after charset declaration
    charset_anchor.add_next_sibling(new_title)
  else
    set_metadata_element(new_title)
  end
end
The type for this document
/*
 * Return the `type` field of the underlying libxml2 document structure
 * as a Ruby Integer.
 */
static VALUE
rb_html_document_type(VALUE self)
{
  htmlDocPtr c_doc;

  c_doc = noko_xml_document_unwrap(self);

  return INT2NUM(c_doc->type);
}
- Returns
-
The document type which determines CSS-to-XPath translation.
See XPathVisitor for more information.
# File lib/nokogiri/html4/document.rb, line 159
#
# Return the doctype configuration constant for HTML4 documents,
# Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4.
def xpath_doctype
  return Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
end