Skip to content

Commit 3e95cfa

Browse files
committed
Fix C14N document subset canonicalization with namespace nodes
Remove the xns->next = NULL hack from rxml_xpath_object_wrap. In XPath results, libxml2 stores the parent element pointer in xns->next (see xmlXPathNodeSetAddNs). Zeroing it broke xmlC14NIsNodeInNodeset which relies on that pointer to match namespace nodes in the input node set. Instead, guard Namespace#next to detect the XPath convention by checking whether next points to a namespace (type == XML_LOCAL_NAMESPACE). When canonicalize receives an XPath::Object for :nodes, pass the raw xmlNodeSet directly to xmlC14NDocDumpMemory instead of roundtripping through Ruby objects, which dropped namespace nodes. Enable W3C C14N spec examples 3.5 (entity references) and 3.7 (document subsets) which were previously commented out or marked TODO.
1 parent 6e6e602 commit 3e95cfa

5 files changed

Lines changed: 71 additions & 57 deletions

File tree

ext/libxml/ruby_xml_document.c

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -275,42 +275,50 @@ rxml_document_canonicalize(int argc, VALUE *argv, VALUE self)
275275
//o_ns_prefixes will free at end of block
276276

277277
o_nodes = rb_hash_aref(option_hash, ID2SYM(rb_intern("nodes")));
278-
if (!NIL_P(o_nodes))
278+
if (!NIL_P(o_nodes))
279279
{
280-
int i;
281-
int p = 0; // index of pointer array
282-
VALUE * list_in = NULL;
283-
long node_list_size = 0;
284-
285280
if (CLASS_OF(o_nodes) == cXMLXPathObject)
286281
{
287-
o_nodes = rb_funcall(o_nodes, rb_intern("to_a"), 0);
282+
/* Use the raw xmlNodeSet directly to preserve namespace nodes
283+
which cannot survive a roundtrip through Ruby objects */
284+
rxml_xpath_object *rxpop;
285+
TypedData_Get_Struct(o_nodes, rxml_xpath_object, &rxml_xpath_object_data_type, rxpop);
286+
if (rxpop->xpop->nodesetval)
287+
{
288+
nodeset.nodeNr = rxpop->xpop->nodesetval->nodeNr;
289+
nodeset.nodeMax = rxpop->xpop->nodesetval->nodeMax;
290+
nodeset.nodeTab = rxpop->xpop->nodesetval->nodeTab;
291+
}
288292
}
289293
else
290294
{
291-
Check_Type(o_nodes, T_ARRAY);
292-
}
293-
list_in = RARRAY_PTR(o_nodes);
294-
node_list_size = RARRAY_LEN(o_nodes);
295+
int i;
296+
int p = 0;
297+
VALUE *list_in = NULL;
298+
long node_list_size = 0;
295299

296-
for (i=0; i < node_list_size; ++i)
297-
{
298-
if (p >= C14N_NODESET_LIMIT) { break; }
300+
Check_Type(o_nodes, T_ARRAY);
301+
list_in = RARRAY_PTR(o_nodes);
302+
node_list_size = RARRAY_LEN(o_nodes);
299303

300-
if (RTEST(list_in[i]))
304+
for (i=0; i < node_list_size; ++i)
301305
{
302-
xmlNodePtr node_ptr;
303-
TypedData_Get_Struct(list_in[i], xmlNode, &rxml_node_data_type, node_ptr);
304-
node_ptr_array[p] = node_ptr;
305-
p++;
306+
if (p >= C14N_NODESET_LIMIT) { break; }
307+
308+
if (RTEST(list_in[i]))
309+
{
310+
xmlNodePtr node_ptr;
311+
TypedData_Get_Struct(list_in[i], xmlNode, &rxml_node_data_type, node_ptr);
312+
node_ptr_array[p] = node_ptr;
313+
p++;
314+
}
306315
}
307-
}
308316

309-
// Need to set values in nodeset struct
310-
nodeset.nodeNr = (node_list_size > C14N_NODESET_LIMIT ?
311-
C14N_NODESET_LIMIT :
312-
(int)node_list_size);
313-
nodeset.nodeTab = node_ptr_array;
317+
nodeset.nodeNr = (node_list_size > C14N_NODESET_LIMIT ?
318+
C14N_NODESET_LIMIT :
319+
(int)node_list_size);
320+
nodeset.nodeTab = node_ptr_array;
321+
}
314322
}
315323
}//option_hash
316324

ext/libxml/ruby_xml_namespace.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,10 +137,14 @@ static VALUE rxml_namespace_next(VALUE self)
137137
{
138138
xmlNsPtr xns;
139139
TypedData_Get_Struct(self, xmlNs, &rxml_namespace_type, xns);
140-
if (xns == NULL || xns->next == NULL)
140+
/* Guard against libxml2's XPath hack where xns->next stores a parent
141+
element pointer instead of the next namespace (see xmlXPathNodeSetAddNs
142+
in xpath.c). xmlNs.type and xmlNode.type share the same struct offset,
143+
so checking the type is safe even when next points to an xmlNode. */
144+
if (xns == NULL || xns->next == NULL || xns->next->type != XML_LOCAL_NAMESPACE)
141145
return (Qnil);
142-
else
143-
return rxml_namespace_wrap(xns->next);
146+
147+
return rxml_namespace_wrap(xns->next);
144148
}
145149

146150
void rxml_init_namespace(void)

ext/libxml/ruby_xml_xpath_object.c

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ static void rxml_xpath_object_mark(void *data)
6464
rb_gc_mark(rxpop->nsnodes);
6565
}
6666

67-
static const rb_data_type_t rxml_xpath_object_data_type = {
67+
const rb_data_type_t rxml_xpath_object_data_type = {
6868
.wrap_struct_name = "LibXML::XML::XPath::Object",
6969
.function = { .dmark = rxml_xpath_object_mark, .dfree = rxml_xpath_object_free },
7070
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
@@ -82,7 +82,11 @@ VALUE rxml_xpath_object_wrap(VALUE document, xmlDocPtr xdoc, xmlXPathObjectPtr x
8282
rxpopp->xdoc = xdoc;
8383
rxpopp->xpop = xpop;
8484

85-
/* Find all the extra namespace nodes and wrap them. */
85+
/* Find all the extra namespace nodes and wrap them. In XPath results,
86+
libxml2 stores the parent element pointer in xns->next (a hack -- see
87+
xmlXPathNodeSetAddNs in xpath.c). We leave that pointer intact so
88+
that xmlC14NIsNodeInNodeset can match namespace nodes for C14N.
89+
Namespace#next is guarded to detect this hack and return nil. */
8690
if (xpop->nodesetval && xpop->nodesetval->nodeNr)
8791
{
8892
for (i = 0; i < xpop->nodesetval->nodeNr; i++)
@@ -91,12 +95,6 @@ VALUE rxml_xpath_object_wrap(VALUE document, xmlDocPtr xdoc, xmlXPathObjectPtr x
9195
if (xnode != NULL && xnode->type == XML_NAMESPACE_DECL)
9296
{
9397
VALUE ns = Qnil;
94-
xmlNsPtr xns = (xmlNsPtr)xnode;
95-
96-
/* Get rid of libxml's -> next hack. The issue here is
97-
the rxml_namespace code assumes that ns->next refers
98-
to another namespace. */
99-
xns->next = NULL;
10098

10199
/* Specify a custom free function here since by default
102100
namespace nodes will not be freed */

ext/libxml/ruby_xml_xpath_object.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define __RXML_XPATH_OBJECT__
33

44
extern VALUE cXMLXPathObject;
5+
extern const rb_data_type_t rxml_xpath_object_data_type;
56

67
typedef struct rxml_xpath_object
78
{

test/test_canonicalize.rb

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -67,26 +67,23 @@ def test_canonicalize_with_w3c_c14n_3_4
6767

6868
# (www.w3.org) 3.5 Entity References
6969
# http://www.w3.org/TR/xml-c14n#Example-Entities
70-
# (2012-02-20) Failing likely due to a logic error
71-
# - libxml2(c14n.c:1788) XML_ENTITY_REF_NODE is invalid node for parsing.
7270
def test_canonicalize_with_w3c_c14n_3_5
73-
#given_doc = LibXML::XML::Document.file(self.path('c14n/given/example-5.xml'))
71+
# NOENT resolves entity references, DTDLOAD loads the external entity
72+
options = LibXML::XML::Parser::Options::NOENT | LibXML::XML::Parser::Options::DTDLOAD
73+
given_doc = LibXML::XML::Document.file(self.path('c14n/given/example-5.xml'), options: options)
7474

7575
# With Comments
76-
#expected_with_comments = IO.read(self.path('c14n/result/with-comments/example-5'))
77-
78-
# TODO - CANNOT COMPLETE TEST unless libxml2 supports additional node types.
79-
#assert_equal(expected_with_comments, given_doc.canonicalize(:comments => true))
76+
expected_with_comments = IO.read(self.path('c14n/result/with-comments/example-5'))
77+
assert_equal(expected_with_comments, given_doc.canonicalize(:comments => true))
8078

8179
# Without Comments
82-
#expected_without_comments = IO.read(self.path('c14n/result/without-comments/example-5'))
83-
# TODO - CANNOT COMPLETE TEST unless libxml2 supports additional node types.
84-
#assert_equal(expected_without_comments, given_doc.canonicalize(:comments => false))
85-
#expected_1_1_without_comments = IO.read(self.path('c14n/result/1-1-without-comments/example-5'))
86-
#mode = LibXML::XML::Document::XML_C14N_1_1
87-
88-
# TODO - CANNOT COMPLETE TEST unless libxml2 supports additional node types.
89-
#assert_equal(expected_1_1_without_comments, given_doc.canonicalize(:mode => mode))
80+
expected_without_comments = IO.read(self.path('c14n/result/without-comments/example-5'))
81+
assert_equal(expected_without_comments, given_doc.canonicalize(:comments => false))
82+
83+
# Without Comments (XML_C14N_1_1)
84+
expected_1_1_without_comments = IO.read(self.path('c14n/result/1-1-without-comments/example-5'))
85+
mode = LibXML::XML::Document::XML_C14N_1_1
86+
assert_equal(expected_1_1_without_comments, given_doc.canonicalize(:mode => mode))
9087
end
9188

9289
# (www.w3.org) 3.6 UTF-8 Encoding
@@ -104,16 +101,22 @@ def test_canonicalize_with_w3c_c14n_3_6
104101
# (www.w3.org) 3.7 Document Subsets
105102
# http://www.w3.org/TR/xml-c14n#Example-DocSubsets
106103
def test_canonicalize_with_w3c_c14n_3_7
107-
# Non Canonicalized Document
108-
given_doc = LibXML::XML::Document.file(self.path('c14n/given/example-7.xml'))
104+
# Parse with DTDATTR and DTDLOAD so default attributes and ID types
105+
# from the internal DTD subset are materialized
106+
options = LibXML::XML::Parser::Options::DTDATTR | LibXML::XML::Parser::Options::DTDLOAD
107+
given_doc = LibXML::XML::Document.file(self.path('c14n/given/example-7.xml'),
108+
options: options)
109109
expected = IO.read(self.path('c14n/result/without-comments/example-7'))
110110

111-
e1_node = given_doc.find_first('ietf:e1', 'ietf:http://www.ietf.org')
111+
ctx = given_doc.context
112+
ctx.register_namespace('ietf', 'http://www.ietf.org')
112113

113-
# Select current node, all child nodes, all attributes and namespace nodes
114-
subdoc_nodes = e1_node.find("(.//.|.//@id|namespace::*)")
114+
# W3C C14N spec example 3.7 XPath expression for document subsets
115+
subdoc_nodes = ctx.find(
116+
"(//. | //@* | //namespace::*)" \
117+
"[self::ietf:e1 or (parent::ietf:e1 and not(self::text() or self::e2))" \
118+
" or count(id('E3')|ancestor-or-self::node()) = count(ancestor-or-self::node())]")
115119

116-
# TODO - This fails because the namespace nodes aren't taken into account
117120
assert_equal(expected, given_doc.canonicalize(:nodes => subdoc_nodes))
118121
end
119122
end

0 commit comments

Comments
 (0)