Orcus
Loading...
Searching...
No Matches
sax_parser.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8#ifndef INCLUDED_ORCUS_SAX_PARSER_HPP
9#define INCLUDED_ORCUS_SAX_PARSER_HPP
10
11#include "sax_parser_base.hpp"
12
13#include <string_view>
14
15namespace orcus {
16
/**
 * Default compile-time configuration for sax_parser.  It is the default
 * argument for the parser's ConfigT template parameter.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version expressed as an integer.  The parser's header
     * step demands a leading '<?xml ... ?>' declaration only when this value
     * is 11 or greater; with the default of 10 the declaration is optional.
     */
    static constexpr uint8_t baseline_version = 10;
};
26
28{
29public:
36 {
37 (void)dtd;
38 }
39
47 void start_declaration(std::string_view decl)
48 {
49 (void)decl;
50 }
51
57 void end_declaration(std::string_view decl)
58 {
59 (void)decl;
60 }
61
68 {
69 (void)elem;
70 }
71
78 {
79 (void)elem;
80 }
81
96 void characters(std::string_view val, bool transient)
97 {
98 (void)val; (void)transient;
99 }
100
110 {
111 (void)attr;
112 }
113};
114
130template<typename HandlerT, typename ConfigT = sax_parser_default_config>
132{
133public:
134 typedef HandlerT handler_type;
135 typedef ConfigT config_type;
136
137 sax_parser(std::string_view content, handler_type& handler);
138 ~sax_parser() = default;
139
140 void parse();
141
142private:
143
148 void header();
149 void body();
150 void element();
151 void element_open(std::ptrdiff_t begin_pos);
152 void element_close(std::ptrdiff_t begin_pos);
153 void special_tag();
154 void declaration(const char* name_check);
155 void cdata();
156 void doctype();
157 void characters();
158 void attribute();
159
160private:
161 handler_type& m_handler;
162};
163
164template<typename HandlerT, typename ConfigT>
165sax_parser<HandlerT,ConfigT>::sax_parser(std::string_view content, handler_type& handler) :
166 sax::parser_base(content.data(), content.size()),
167 m_handler(handler)
168{
169}
170
171template<typename HandlerT, typename ConfigT>
172void sax_parser<HandlerT,ConfigT>::parse()
173{
174 m_nest_level = 0;
175 mp_char = mp_begin;
176 header();
177 skip_space_and_control();
178 body();
179
180 assert(m_buffer_pos == 0);
181}
182
183template<typename HandlerT, typename ConfigT>
184void sax_parser<HandlerT,ConfigT>::header()
185{
186 // we don't handle multi byte encodings so we can just skip bom entry if exists.
187 skip_bom();
188
189 // Allow leading whitespace in the XML stream.
190 // TODO : Make this configurable since strictly speaking such an XML
191 // sttream is invalid.
192 skip_space_and_control();
193
194 if (!has_char() || cur_char() != '<')
195 throw malformed_xml_error("xml file must begin with '<'.", offset());
196
197 if (config_type::baseline_version >= 11)
198 {
199 // XML version 1.1 requires a header declaration whereas in 1.0 it's
200 // optional.
201 if (next_char_checked() != '?')
202 throw malformed_xml_error("xml file must begin with '<?'.", offset());
203
204 declaration("xml");
205 }
206}
207
208template<typename HandlerT, typename ConfigT>
209void sax_parser<HandlerT,ConfigT>::body()
210{
211 while (has_char())
212 {
213 if (cur_char() == '<')
214 {
215 element();
216 if (!m_root_elem_open)
217 // Root element closed. Stop parsing.
218 return;
219 }
220 else if (m_nest_level)
221 // Call characters only when in xml hierarchy.
222 characters();
223 else
224 next();
225 }
226}
227
228template<typename HandlerT, typename ConfigT>
229void sax_parser<HandlerT,ConfigT>::element()
230{
231 assert(cur_char() == '<');
232 std::ptrdiff_t pos = offset();
233 char c = next_char_checked();
234 switch (c)
235 {
236 case '/':
237 element_close(pos);
238 return;
239 case '!':
240 special_tag();
241 return;
242 case '?':
243 declaration(nullptr);
244 return;
245 }
246
247 element_open(pos);
248}
249
250template<typename HandlerT, typename ConfigT>
251void sax_parser<HandlerT,ConfigT>::element_open(std::ptrdiff_t begin_pos)
252{
253 sax::parser_element elem;
254 element_name(elem, begin_pos);
255
256 while (true)
257 {
258 skip_space_and_control();
259 char c = cur_char_checked();
260 if (c == '/')
261 {
262 // Self-closing element: <element/>
263 if (next_and_char() != '>')
264 throw malformed_xml_error("expected '/>' to self-close the element.", offset());
265 next();
266 elem.end_pos = offset();
267 m_handler.start_element(elem);
268 reset_buffer_pos();
269 m_handler.end_element(elem);
270 if (!m_nest_level)
271 m_root_elem_open = false;
272#if ORCUS_DEBUG_SAX_PARSER
273 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
274#endif
275 return;
276 }
277 else if (c == '>')
278 {
279 // End of opening element: <element>
280 next();
281 elem.end_pos = offset();
282 nest_up();
283 m_handler.start_element(elem);
284 reset_buffer_pos();
285#if ORCUS_DEBUG_SAX_PARSER
286 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
287#endif
288 return;
289 }
290 else
291 attribute();
292 }
293}
294
295template<typename HandlerT, typename ConfigT>
296void sax_parser<HandlerT,ConfigT>::element_close(std::ptrdiff_t begin_pos)
297{
298 assert(cur_char() == '/');
299 nest_down();
300 next_check();
301 sax::parser_element elem;
302 element_name(elem, begin_pos);
303
304 if (cur_char() != '>')
305 throw malformed_xml_error("expected '>' to close the element.", offset());
306 next();
307 elem.end_pos = offset();
308
309 m_handler.end_element(elem);
310#if ORCUS_DEBUG_SAX_PARSER
311 cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
312#endif
313 if (!m_nest_level)
314 m_root_elem_open = false;
315}
316
317template<typename HandlerT, typename ConfigT>
318void sax_parser<HandlerT,ConfigT>::special_tag()
319{
320 assert(cur_char() == '!');
321 // This can be either <![CDATA, <!--, or <!DOCTYPE.
322 size_t len = available_size();
323 if (len < 2)
324 throw malformed_xml_error("special tag too short.", offset());
325
326 switch (next_and_char())
327 {
328 case '-':
329 {
330 // Possibly comment.
331 if (next_and_char() != '-')
332 throw malformed_xml_error("comment expected.", offset());
333
334 len -= 2;
335 if (len < 3)
336 throw malformed_xml_error("malformed comment.", offset());
337
338 next();
339 comment();
340 }
341 break;
342 case '[':
343 {
344 // Possibly a CDATA.
345 expects_next("CDATA[", 6);
346 if (has_char())
347 cdata();
348 }
349 break;
350 case 'D':
351 {
352 // check if this is a DOCTYPE.
353 expects_next("OCTYPE", 6);
354 skip_space_and_control();
355 if (has_char())
356 doctype();
357 }
358 break;
359 default:
360 throw malformed_xml_error("failed to parse special tag.", offset());
361 }
362}
363
364template<typename HandlerT, typename ConfigT>
365void sax_parser<HandlerT,ConfigT>::declaration(const char* name_check)
366{
367 assert(cur_char() == '?');
368 next_check();
369
370 // Get the declaration name first.
371 std::string_view decl_name;
372 name(decl_name);
373#if ORCUS_DEBUG_SAX_PARSER
374 cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
375#endif
376
377 if (name_check && decl_name != name_check)
378 {
379 std::ostringstream os;
380 os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
381 throw malformed_xml_error(os.str(), offset());
382 }
383
384 m_handler.start_declaration(decl_name);
385 skip_space_and_control();
386
387 // Parse the attributes.
388 while (cur_char_checked() != '?')
389 {
390 attribute();
391 skip_space_and_control();
392 }
393 if (next_char_checked() != '>')
394 throw malformed_xml_error("declaration must end with '?>'.", offset());
395
396 m_handler.end_declaration(decl_name);
397 reset_buffer_pos();
398 next();
399#if ORCUS_DEBUG_SAX_PARSER
400 cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
401#endif
402}
403
404template<typename HandlerT, typename ConfigT>
405void sax_parser<HandlerT,ConfigT>::cdata()
406{
407 size_t len = available_size();
408 assert(len > 3);
409
410 // Parse until we reach ']]>'.
411 const char* p0 = mp_char;
412 size_t i = 0, match = 0;
413 for (char c = cur_char(); i < len; ++i, c = next_and_char())
414 {
415 if (c == ']')
416 {
417 // Be aware that we may encounter a series of more than two ']'
418 // characters, in which case we'll only count the last two.
419
420 if (match == 0)
421 // First ']'
422 ++match;
423 else if (match == 1)
424 // Second ']'
425 ++match;
426 }
427 else if (c == '>' && match == 2)
428 {
429 // Found ']]>'.
430 size_t cdata_len = i - 2;
431 m_handler.characters(std::string_view(p0, cdata_len), false);
432 next();
433 return;
434 }
435 else
436 match = 0;
437 }
438 throw malformed_xml_error("malformed CDATA section.", offset());
439}
440
441template<typename HandlerT, typename ConfigT>
442void sax_parser<HandlerT,ConfigT>::doctype()
443{
444 // Parse the root element first.
445 sax::doctype_declaration param;
446 name(param.root_element);
447 skip_space_and_control();
448
449 // Either PUBLIC or SYSTEM.
450 size_t len = available_size();
451 if (len < 6)
452 throw malformed_xml_error("DOCTYPE section too short.", offset());
453
454 param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
455 char c = cur_char();
456 if (c == 'P')
457 {
458 if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
459 throw malformed_xml_error("malformed DOCTYPE section.", offset());
460
461 param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
462 }
463 else if (c == 'S')
464 {
465 if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
466 throw malformed_xml_error("malformed DOCTYPE section.", offset());
467 }
468
469 next_check();
470 skip_space_and_control();
471
472 // Parse FPI.
473 value(param.fpi, false);
474
475 has_char_throw("DOCTYPE section too short.");
476 skip_space_and_control();
477 has_char_throw("DOCTYPE section too short.");
478
479 if (cur_char() == '>')
480 {
481 // Optional URI not given. Exit.
482#if ORCUS_DEBUG_SAX_PARSER
483 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
484#endif
485 m_handler.doctype(param);
486 next();
487 return;
488 }
489
490 // Parse optional URI.
491 value(param.uri, false);
492
493 has_char_throw("DOCTYPE section too short.");
494 skip_space_and_control();
495 has_char_throw("DOCTYPE section too short.");
496
497 if (cur_char() != '>')
498 throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
499
500#if ORCUS_DEBUG_SAX_PARSER
501 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
502#endif
503 m_handler.doctype(param);
504 next();
505}
506
507template<typename HandlerT, typename ConfigT>
508void sax_parser<HandlerT,ConfigT>::characters()
509{
510 const char* p0 = mp_char;
511 for (; has_char(); next())
512 {
513 if (cur_char() == '<')
514 break;
515
516 if (cur_char() == '&')
517 {
518 // Text span with one or more encoded characters. Parse using cell buffer.
519 cell_buffer& buf = get_cell_buffer();
520 buf.reset();
521 buf.append(p0, mp_char-p0);
522 characters_with_encoded_char(buf);
523 if (buf.empty())
524 m_handler.characters(std::string_view{}, false);
525 else
526 m_handler.characters(buf.str(), true);
527 return;
528 }
529 }
530
531 if (mp_char > p0)
532 {
533 std::string_view val(p0, mp_char-p0);
534 m_handler.characters(val, false);
535 }
536}
537
538template<typename HandlerT, typename ConfigT>
539void sax_parser<HandlerT,ConfigT>::attribute()
540{
541 sax::parser_attribute attr;
542 attribute_name(attr.ns, attr.name);
543
544#if ORCUS_DEBUG_SAX_PARSER
545 cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
546#endif
547
548 skip_space_and_control();
549
550 char c = cur_char_checked();
551 if (c != '=')
552 {
553 std::ostringstream os;
554 os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
555 throw malformed_xml_error(os.str(), offset());
556 }
557
558 next_check(); // skip the '='.
559 skip_space_and_control();
560
561 attr.transient = value(attr.value, true);
562 if (attr.transient)
563 // Value is stored in a temporary buffer. Push a new buffer.
564 inc_buffer_pos();
565
566#if ORCUS_DEBUG_SAX_PARSER
567 cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
568#endif
569
570 m_handler.attribute(attr);
571}
572
573}
574
575#endif
576/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition parser_base.hpp:23
Definition sax_parser_base.hpp:108
Definition sax_parser.hpp:28
void end_declaration(std::string_view decl)
Definition sax_parser.hpp:57
void doctype(const orcus::sax::doctype_declaration &dtd)
Definition sax_parser.hpp:35
void attribute(const orcus::sax::parser_attribute &attr)
Definition sax_parser.hpp:109
void characters(std::string_view val, bool transient)
Definition sax_parser.hpp:96
void start_declaration(std::string_view decl)
Definition sax_parser.hpp:47
void end_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:77
void start_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:67
Definition sax_parser.hpp:132
Definition sax_parser_base.hpp:37
Definition sax_parser_base.hpp:96
Definition sax_parser_base.hpp:77
Definition sax_parser.hpp:18
static constexpr uint8_t baseline_version
Definition sax_parser.hpp:24