LibreOffice Module xmlreader (master) 1
xmlreader.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <sal/config.h>
21
22#include <cassert>
23#include <climits>
24
25#include <com/sun/star/container/NoSuchElementException.hpp>
26#include <com/sun/star/uno/RuntimeException.hpp>
27#include <osl/file.h>
28#include <rtl/character.hxx>
29#include <rtl/string.h>
30#include <rtl/ustring.hxx>
31#include <sal/log.hxx>
32#include <sal/types.h>
33#include <utility>
34#include <xmlreader/pad.hxx>
35#include <xmlreader/span.hxx>
37
38namespace xmlreader {
39
40namespace {
41
// Returns true iff c is one of the four characters this reader treats as
// whitespace: TAB (0x09), LF (0x0A), CR (0x0D) or SPACE.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
53
54}
55
56XmlReader::XmlReader(OUString fileUrl)
57 : fileUrl_(std::move(fileUrl))
58 , fileHandle_(nullptr)
59{
60 oslFileError e = osl_openFile(
61 fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
62 switch (e)
63 {
64 case osl_File_E_None:
65 break;
66 case osl_File_E_NOENT:
67 throw css::container::NoSuchElementException( fileUrl_ );
68 default:
69 throw css::uno::RuntimeException(
70 "cannot open " + fileUrl_ + ": " + OUString::number(e));
71 }
72 e = osl_getFileSize(fileHandle_, &fileSize_);
73 if (e == osl_File_E_None) {
74 e = osl_mapFile(
76 osl_File_MapFlag_WillNeed);
77 }
78 if (e != osl_File_E_None) {
79 oslFileError e2 = osl_closeFile(fileHandle_);
80 if (e2 != osl_File_E_None) {
82 "xmlreader",
83 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
84 }
85 throw css::uno::RuntimeException(
86 "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" );
87 }
88 namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
89 namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);
90 pos_ = static_cast< char * >(fileAddress_);
93 firstAttribute_ = true;
94}
95
97 if (!fileHandle_)
98 return;
99 oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
100 if (e != osl_File_E_None) {
101 SAL_WARN(
102 "xmlreader",
103 "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
104 }
105 e = osl_closeFile(fileHandle_);
106 if (e != osl_File_E_None) {
107 SAL_WARN(
108 "xmlreader",
109 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
110 }
111}
112
114 int id = toNamespaceId(namespaceIris_.size());
115 namespaceIris_.push_back(iri);
116 if (iri == "http://www.w3.org/2001/XMLSchema-instance") {
117 // Old user layer .xcu files used the xsi namespace prefix without
118 // declaring a corresponding namespace binding, see issue 77174; reading
119 // those files during migration would fail without this hack that can be
120 // removed once migration is no longer relevant (see
121 // configmgr::Components::parseModificationLayer):
122 namespaces_.emplace_back(Span("xsi"), id);
123 }
124 return id;
125}
126
127XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
128{
129 switch (state_) {
130 case State::Content:
131 switch (reportText) {
132 case Text::NONE:
133 return handleSkippedText(data, nsId);
134 case Text::Raw:
135 return handleRawText(data);
136 default: // Text::Normalized
137 return handleNormalizedText(data);
138 }
139 case State::StartTag:
140 return handleStartTag(nsId, data);
141 case State::EndTag:
142 return handleEndTag();
145 return Result::End;
146 default: // State::Done
147 return Result::Done;
148 }
149}
150
151bool XmlReader::nextAttribute(int * nsId, Span * localName) {
152 assert(nsId != nullptr && localName != nullptr);
153 if (firstAttribute_) {
155 firstAttribute_ = false;
156 } else {
158 }
159 if (currentAttribute_ == attributes_.end()) {
160 return false;
161 }
162 if (currentAttribute_->nameColon == nullptr) {
163 *nsId = NAMESPACE_NONE;
164 *localName = Span(
165 currentAttribute_->nameBegin,
166 currentAttribute_->nameEnd - currentAttribute_->nameBegin);
167 } else {
168 *nsId = getNamespaceId(
169 Span(
170 currentAttribute_->nameBegin,
171 currentAttribute_->nameColon - currentAttribute_->nameBegin));
172 *localName = Span(
173 currentAttribute_->nameColon + 1,
174 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
175 }
176 return true;
177}
178
179Span XmlReader::getAttributeValue(bool fullyNormalize) {
181 currentAttribute_->valueBegin, currentAttribute_->valueEnd,
182 fullyNormalize);
183}
184
185int XmlReader::getNamespaceId(Span const & prefix) const {
186 auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(),
187 [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; });
188
189 if (i != namespaces_.rend())
190 return i->nsId;
191
192 return NAMESPACE_UNKNOWN;
193}
194
195
197 char const * p = text.begin;
198 sal_Int32 n = text.length;
199 for (;;) {
200 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
201 if (i < 0) {
202 break;
203 }
204 pad_.add(p, i);
205 p += i + 1;
206 n -= i + 1;
207 if (n == 0 || *p != '\x0A') {
208 pad_.add("\x0A");
209 }
210 }
211 pad_.add(p, n);
212}
213
215 while (isSpace(peek())) {
216 ++pos_;
217 }
218}
219
221 if (rtl_str_shortenedCompare_WithLength(
222 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
223 RTL_CONSTASCII_LENGTH("--")) !=
224 0)
225 {
226 return false;
227 }
228 pos_ += RTL_CONSTASCII_LENGTH("--");
229 sal_Int32 i = rtl_str_indexOfStr_WithLength(
230 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
231 if (i < 0) {
232 throw css::uno::RuntimeException(
233 "premature end (within comment) of " + fileUrl_ );
234 }
235 pos_ += i + RTL_CONSTASCII_LENGTH("--");
236 if (read() != '>') {
237 throw css::uno::RuntimeException(
238 "illegal \"--\" within comment in " + fileUrl_ );
239 }
240 return true;
241}
242
244 sal_Int32 i = rtl_str_indexOfStr_WithLength(
245 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
246 if (i < 0) {
247 throw css::uno::RuntimeException(
248 "bad '<?' in " + fileUrl_ );
249 }
250 pos_ += i + RTL_CONSTASCII_LENGTH("?>");
251}
252
254 // Neither is it checked that the doctypedecl is at the correct position in
255 // the document, nor that it is well-formed:
256 for (;;) {
257 char c = read();
258 switch (c) {
259 case '\0': // i.e., EOF
260 throw css::uno::RuntimeException(
261 "premature end (within DTD) of " + fileUrl_ );
262 case '"':
263 case '\'':
264 {
265 sal_Int32 i = rtl_str_indexOfChar_WithLength(
266 pos_, end_ - pos_, c);
267 if (i < 0) {
268 throw css::uno::RuntimeException(
269 "premature end (within DTD) of " + fileUrl_ );
270 }
271 pos_ += i + 1;
272 }
273 break;
274 case '>':
275 return;
276 case '[':
277 for (;;) {
278 c = read();
279 switch (c) {
280 case '\0': // i.e., EOF
281 throw css::uno::RuntimeException(
282 "premature end (within DTD) of " + fileUrl_ );
283 case '"':
284 case '\'':
285 {
286 sal_Int32 i = rtl_str_indexOfChar_WithLength(
287 pos_, end_ - pos_, c);
288 if (i < 0) {
289 throw css::uno::RuntimeException(
290 "premature end (within DTD) of " + fileUrl_ );
291 }
292 pos_ += i + 1;
293 }
294 break;
295 case '<':
296 switch (read()) {
297 case '\0': // i.e., EOF
298 throw css::uno::RuntimeException(
299 "premature end (within DTD) of " + fileUrl_ );
300 case '!':
301 skipComment();
302 break;
303 case '?':
305 break;
306 default:
307 break;
308 }
309 break;
310 case ']':
311 skipSpace();
312 if (read() != '>') {
313 throw css::uno::RuntimeException(
314 "missing \">\" of DTD in " + fileUrl_ );
315 }
316 return;
317 default:
318 break;
319 }
320 }
321 default:
322 break;
323 }
324 }
325}
326
328 if (rtl_str_shortenedCompare_WithLength(
329 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
330 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
331 0)
332 {
333 return Span();
334 }
335 pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
336 char const * begin = pos_;
337 sal_Int32 i = rtl_str_indexOfStr_WithLength(
338 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
339 if (i < 0) {
340 throw css::uno::RuntimeException(
341 "premature end (within CDATA section) of " + fileUrl_ );
342 }
343 pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
344 return Span(begin, i);
345}
346
347bool XmlReader::scanName(char const ** nameColon) {
348 assert(nameColon != nullptr && *nameColon == nullptr);
349 for (char const * begin = pos_;; ++pos_) {
350 switch (peek()) {
351 case '\0': // i.e., EOF
352 case '\x09':
353 case '\x0A':
354 case '\x0D':
355 case ' ':
356 case '/':
357 case '=':
358 case '>':
359 return pos_ != begin;
360 case ':':
361 *nameColon = pos_;
362 break;
363 default:
364 break;
365 }
366 }
367}
368
369int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
370 assert(begin != nullptr && begin <= end);
371 Span iri(handleAttributeValue(begin, end, false));
372 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
373 if (namespaceIris_[i] == iri) {
374 return toNamespaceId(i);
375 }
376 }
378}
379
380char const * XmlReader::handleReference(char const * position, char const * end)
381{
382 assert(position != nullptr && *position == '&' && position < end);
383 ++position;
384 if (*position == '#') {
385 ++position;
386 sal_uInt32 val = 0;
387 char const * p;
388 if (*position == 'x') {
389 ++position;
390 p = position;
391 for (;; ++position) {
392 char c = *position;
393 if (c >= '0' && c <= '9') {
394 val = 16 * val + (c - '0');
395 } else if (c >= 'A' && c <= 'F') {
396 val = 16 * val + (c - 'A') + 10;
397 } else if (c >= 'a' && c <= 'f') {
398 val = 16 * val + (c - 'a') + 10;
399 } else {
400 break;
401 }
402 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
403 throw css::uno::RuntimeException(
404 "'&#x...' too large in " + fileUrl_ );
405 }
406 }
407 } else {
408 p = position;
409 for (;; ++position) {
410 char c = *position;
411 if (c >= '0' && c <= '9') {
412 val = 10 * val + (c - '0');
413 } else {
414 break;
415 }
416 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
417 throw css::uno::RuntimeException(
418 "'&#...' too large in " + fileUrl_ );
419 }
420 }
421 }
422 if (position == p || *position++ != ';') {
423 throw css::uno::RuntimeException(
424 "'&#...' missing ';' in " + fileUrl_ );
425 }
426 assert(rtl::isUnicodeCodePoint(val));
427 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
428 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
429 {
430 throw css::uno::RuntimeException(
431 "character reference denoting invalid character in " + fileUrl_ );
432 }
433 char buf[4];
434 sal_Int32 len;
435 if (val < 0x80) {
436 buf[0] = static_cast< char >(val);
437 len = 1;
438 } else if (val < 0x800) {
439 buf[0] = static_cast< char >((val >> 6) | 0xC0);
440 buf[1] = static_cast< char >((val & 0x3F) | 0x80);
441 len = 2;
442 } else if (val < 0x10000) {
443 buf[0] = static_cast< char >((val >> 12) | 0xE0);
444 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
445 buf[2] = static_cast< char >((val & 0x3F) | 0x80);
446 len = 3;
447 } else {
448 buf[0] = static_cast< char >((val >> 18) | 0xF0);
449 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
450 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
451 buf[3] = static_cast< char >((val & 0x3F) | 0x80);
452 len = 4;
453 }
454 pad_.addEphemeral(buf, len);
455 return position;
456 } else {
457 struct EntityRef {
458 char const * inBegin;
459 sal_Int32 const inLength;
460 char const * outBegin;
461 sal_Int32 const outLength;
462 };
463 static EntityRef const refs[] = {
464 { RTL_CONSTASCII_STRINGPARAM("amp;"),
465 RTL_CONSTASCII_STRINGPARAM("&") },
466 { RTL_CONSTASCII_STRINGPARAM("lt;"),
467 RTL_CONSTASCII_STRINGPARAM("<") },
468 { RTL_CONSTASCII_STRINGPARAM("gt;"),
469 RTL_CONSTASCII_STRINGPARAM(">") },
470 { RTL_CONSTASCII_STRINGPARAM("apos;"),
471 RTL_CONSTASCII_STRINGPARAM("'") },
472 { RTL_CONSTASCII_STRINGPARAM("quot;"),
473 RTL_CONSTASCII_STRINGPARAM("\"") } };
474 for (const auto & ref : refs) {
475 if (rtl_str_shortenedCompare_WithLength(
476 position, end - position, ref.inBegin, ref.inLength,
477 ref.inLength) ==
478 0)
479 {
480 position += ref.inLength;
481 pad_.add(ref.outBegin, ref.outLength);
482 return position;
483 }
484 }
485 throw css::uno::RuntimeException(
486 "unknown entity reference in " + fileUrl_ );
487 }
488}
489
491 char const * begin, char const * end, bool fullyNormalize)
492{
493 pad_.clear();
494 if (fullyNormalize) {
495 while (begin != end && isSpace(*begin)) {
496 ++begin;
497 }
498 while (end != begin && isSpace(end[-1])) {
499 --end;
500 }
501 char const * p = begin;
502 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
503 // a single true space character can go into the current span,
504 // everything else breaks the span
505 Space space = SPACE_NONE;
506 while (p != end) {
507 switch (*p) {
508 case '\x09':
509 case '\x0A':
510 case '\x0D':
511 switch (space) {
512 case SPACE_NONE:
513 pad_.add(begin, p - begin);
514 pad_.add(" ");
515 space = SPACE_BREAK;
516 break;
517 case SPACE_SPAN:
518 pad_.add(begin, p - begin);
519 space = SPACE_BREAK;
520 break;
521 case SPACE_BREAK:
522 break;
523 }
524 begin = ++p;
525 break;
526 case ' ':
527 switch (space) {
528 case SPACE_NONE:
529 ++p;
530 space = SPACE_SPAN;
531 break;
532 case SPACE_SPAN:
533 pad_.add(begin, p - begin);
534 begin = ++p;
535 space = SPACE_BREAK;
536 break;
537 case SPACE_BREAK:
538 begin = ++p;
539 break;
540 }
541 break;
542 case '&':
543 pad_.add(begin, p - begin);
545 begin = p;
546 space = SPACE_NONE;
547 break;
548 default:
549 ++p;
550 space = SPACE_NONE;
551 break;
552 }
553 }
554 pad_.add(begin, p - begin);
555 } else {
556 char const * p = begin;
557 while (p != end) {
558 switch (*p) {
559 case '\x09':
560 case '\x0A':
561 pad_.add(begin, p - begin);
562 begin = ++p;
563 pad_.add(" ");
564 break;
565 case '\x0D':
566 pad_.add(begin, p - begin);
567 ++p;
568 if (peek() == '\x0A') {
569 ++p;
570 }
571 begin = p;
572 pad_.add(" ");
573 break;
574 case '&':
575 pad_.add(begin, p - begin);
577 begin = p;
578 break;
579 default:
580 ++p;
581 break;
582 }
583 }
584 pad_.add(begin, p - begin);
585 }
586 return pad_.get();
587}
588
590 assert(nsId != nullptr && localName);
591 char const * nameBegin = pos_;
592 char const * nameColon = nullptr;
593 if (!scanName(&nameColon)) {
594 throw css::uno::RuntimeException(
595 "bad tag name in " + fileUrl_ );
596 }
597 char const * nameEnd = pos_;
598 NamespaceList::size_type inheritedNamespaces = namespaces_.size();
599 bool hasDefaultNs = false;
600 int defaultNsId = NAMESPACE_NONE;
601 attributes_.clear();
602 for (;;) {
603 char const * p = pos_;
604 skipSpace();
605 if (peek() == '/' || peek() == '>') {
606 break;
607 }
608 if (pos_ == p) {
609 throw css::uno::RuntimeException(
610 "missing whitespace before attribute in " + fileUrl_ );
611 }
612 char const * attrNameBegin = pos_;
613 char const * attrNameColon = nullptr;
614 if (!scanName(&attrNameColon)) {
615 throw css::uno::RuntimeException(
616 "bad attribute name in " + fileUrl_ );
617 }
618 char const * attrNameEnd = pos_;
619 skipSpace();
620 if (read() != '=') {
621 throw css::uno::RuntimeException(
622 "missing '=' in " + fileUrl_ );
623 }
624 skipSpace();
625 char del = read();
626 if (del != '\'' && del != '"') {
627 throw css::uno::RuntimeException(
628 "bad attribute value in " + fileUrl_ );
629 }
630 char const * valueBegin = pos_;
631 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
632 if (i < 0) {
633 throw css::uno::RuntimeException(
634 "unterminated attribute value in " + fileUrl_ );
635 }
636 char const * valueEnd = pos_ + i;
637 pos_ += i + 1;
638 if (attrNameColon == nullptr &&
639 Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns")
640 {
641 hasDefaultNs = true;
642 defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
643 } else if (attrNameColon != nullptr &&
644 Span(attrNameBegin, attrNameColon - attrNameBegin) ==
645 "xmlns")
646 {
647 namespaces_.emplace_back(
648 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
649 scanNamespaceIri(valueBegin, valueEnd));
650 } else {
651 attributes_.emplace_back(
652 attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
653 valueEnd);
654 }
655 }
656 if (!hasDefaultNs && !elements_.empty()) {
657 defaultNsId = elements_.top().defaultNamespaceId;
658 }
659 firstAttribute_ = true;
660 if (peek() == '/') {
662 ++pos_;
663 } else {
665 }
666 if (peek() != '>') {
667 throw css::uno::RuntimeException(
668 "missing '>' in " + fileUrl_ );
669 }
670 ++pos_;
671 elements_.push(
673 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
674 defaultNsId));
675 if (nameColon == nullptr) {
676 *nsId = defaultNsId;
677 *localName = Span(nameBegin, nameEnd - nameBegin);
678 } else {
679 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
680 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
681 }
682 return Result::Begin;
683}
684
686 if (elements_.empty()) {
687 throw css::uno::RuntimeException(
688 "spurious end tag in " + fileUrl_ );
689 }
690 char const * nameBegin = pos_;
691 char const * nameColon = nullptr;
692 if (!scanName(&nameColon) ||
693 !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
694 {
695 throw css::uno::RuntimeException(
696 "tag mismatch in " + fileUrl_ );
697 }
699 skipSpace();
700 if (peek() != '>') {
701 throw css::uno::RuntimeException(
702 "missing '>' in " + fileUrl_ );
703 }
704 ++pos_;
705 return Result::End;
706}
707
709 assert(!elements_.empty());
710 auto end = elements_.top().inheritedNamespaces;
711 namespaces_.resize(end);
712 elements_.pop();
714}
715
717 for (;;) {
718 auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_));
719 if (!i) {
720 throw css::uno::RuntimeException(
721 "premature end of " + fileUrl_ );
722 }
723 pos_ = i + 1;
724 switch (peek()) {
725 case '!':
726 ++pos_;
727 if (!skipComment() && !scanCdataSection().is()) {
729 }
730 break;
731 case '/':
732 ++pos_;
733 return handleEndTag();
734 case '?':
735 ++pos_;
737 break;
738 default:
739 return handleStartTag(nsId, data);
740 }
741 }
742}
743
745 pad_.clear();
746 for (char const * begin = pos_;;) {
747 switch (peek()) {
748 case '\0': // i.e., EOF
749 throw css::uno::RuntimeException(
750 "premature end of " + fileUrl_ );
751 case '\x0D':
752 pad_.add(begin, pos_ - begin);
753 ++pos_;
754 if (peek() != '\x0A') {
755 pad_.add("\x0A");
756 }
757 begin = pos_;
758 break;
759 case '&':
760 pad_.add(begin, pos_ - begin);
762 begin = pos_;
763 break;
764 case '<':
765 pad_.add(begin, pos_ - begin);
766 ++pos_;
767 switch (peek()) {
768 case '!':
769 ++pos_;
770 if (!skipComment()) {
771 Span cdata(scanCdataSection());
772 if (cdata.is()) {
773 normalizeLineEnds(cdata);
774 } else {
776 }
777 }
778 begin = pos_;
779 break;
780 case '/':
781 *text = pad_.get();
782 ++pos_;
784 return Result::Text;
785 case '?':
786 ++pos_;
788 begin = pos_;
789 break;
790 default:
791 *text = pad_.get();
793 return Result::Text;
794 }
795 break;
796 default:
797 ++pos_;
798 break;
799 }
800 }
801}
802
804 pad_.clear();
805 char const * flowBegin = pos_;
806 char const * flowEnd = pos_;
807 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
808 // a single true space character can go into the current flow,
809 // everything else breaks the flow
810 Space space = SPACE_START;
811 for (;;) {
812 switch (peek()) {
813 case '\0': // i.e., EOF
814 throw css::uno::RuntimeException(
815 "premature end of " + fileUrl_ );
816 case '\x09':
817 case '\x0A':
818 case '\x0D':
819 switch (space) {
820 case SPACE_START:
821 case SPACE_BREAK:
822 break;
823 case SPACE_NONE:
824 case SPACE_SPAN:
825 space = SPACE_BREAK;
826 break;
827 }
828 ++pos_;
829 break;
830 case ' ':
831 switch (space) {
832 case SPACE_START:
833 case SPACE_BREAK:
834 break;
835 case SPACE_NONE:
836 space = SPACE_SPAN;
837 break;
838 case SPACE_SPAN:
839 space = SPACE_BREAK;
840 break;
841 }
842 ++pos_;
843 break;
844 case '&':
845 switch (space) {
846 case SPACE_START:
847 break;
848 case SPACE_NONE:
849 case SPACE_SPAN:
850 pad_.add(flowBegin, pos_ - flowBegin);
851 break;
852 case SPACE_BREAK:
853 pad_.add(flowBegin, flowEnd - flowBegin);
854 pad_.add(" ");
855 break;
856 }
858 flowBegin = pos_;
859 flowEnd = pos_;
860 space = SPACE_NONE;
861 break;
862 case '<':
863 ++pos_;
864 switch (peek()) {
865 case '!':
866 ++pos_;
867 if (skipComment()) {
868 space = SPACE_BREAK;
869 } else {
870 Span cdata(scanCdataSection());
871 if (cdata.is()) {
872 // CDATA is not normalized (similar to character
873 // references; it keeps the code simple), but it might
874 // arguably be better to normalize it:
875 switch (space) {
876 case SPACE_START:
877 break;
878 case SPACE_NONE:
879 case SPACE_SPAN:
880 pad_.add(flowBegin, pos_ - flowBegin);
881 break;
882 case SPACE_BREAK:
883 pad_.add(flowBegin, flowEnd - flowBegin);
884 pad_.add(" ");
885 break;
886 }
887 normalizeLineEnds(cdata);
888 flowBegin = pos_;
889 flowEnd = pos_;
890 space = SPACE_NONE;
891 } else {
893 }
894 }
895 break;
896 case '/':
897 ++pos_;
898 pad_.add(flowBegin, flowEnd - flowBegin);
899 *text = pad_.get();
901 return Result::Text;
902 case '?':
903 ++pos_;
905 space = SPACE_BREAK;
906 break;
907 default:
908 pad_.add(flowBegin, flowEnd - flowBegin);
909 *text = pad_.get();
911 return Result::Text;
912 }
913 break;
914 default:
915 switch (space) {
916 case SPACE_START:
917 flowBegin = pos_;
918 break;
919 case SPACE_NONE:
920 case SPACE_SPAN:
921 break;
922 case SPACE_BREAK:
923 pad_.add(flowBegin, flowEnd - flowBegin);
924 pad_.add(" ");
925 flowBegin = pos_;
926 break;
927 }
928 flowEnd = ++pos_;
929 space = SPACE_NONE;
930 break;
931 }
932 }
933}
934
935int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
936 assert(pos <= INT_MAX);
937 return static_cast< int >(pos);
938}
939
940}
941
942/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Span get() const
Definition: pad.cxx:58
SAL_DLLPRIVATE void addEphemeral(char const *begin, sal_Int32 length)
Definition: pad.cxx:43
void add(char const *begin, sal_Int32 length)
Definition: pad.cxx:30
void clear()
Definition: pad.cxx:52
SAL_DLLPRIVATE int scanNamespaceIri(char const *begin, char const *end)
Definition: xmlreader.cxx:369
int getNamespaceId(Span const &prefix) const
Definition: xmlreader.cxx:185
int registerNamespaceIri(Span const &iri)
Definition: xmlreader.cxx:113
SAL_DLLPRIVATE Result handleStartTag(int *nsId, Span *localName)
Definition: xmlreader.cxx:589
Attributes attributes_
Definition: xmlreader.hxx:178
SAL_DLLPRIVATE Result handleRawText(Span *text)
Definition: xmlreader.cxx:744
ElementStack elements_
Definition: xmlreader.hxx:174
OUString const fileUrl_
Definition: xmlreader.hxx:168
Result nextItem(Text reportText, Span *data, int *nsId)
Definition: xmlreader.cxx:127
oslFileHandle fileHandle_
Definition: xmlreader.hxx:169
SAL_DLLPRIVATE Result handleNormalizedText(Span *text)
Definition: xmlreader.cxx:803
char const * pos_
Definition: xmlreader.hxx:175
SAL_DLLPRIVATE void skipSpace()
Definition: xmlreader.cxx:214
static SAL_DLLPRIVATE int toNamespaceId(NamespaceIris::size_type pos)
Definition: xmlreader.cxx:935
SAL_DLLPRIVATE void handleElementEnd()
Definition: xmlreader.cxx:708
bool nextAttribute(int *nsId, Span *localName)
Definition: xmlreader.cxx:151
char const * end_
Definition: xmlreader.hxx:176
sal_uInt64 fileSize_
Definition: xmlreader.hxx:170
Attributes::iterator currentAttribute_
Definition: xmlreader.hxx:179
SAL_DLLPRIVATE char peek() const
Definition: xmlreader.hxx:129
SAL_DLLPRIVATE void skipDocumentTypeDeclaration()
Definition: xmlreader.cxx:253
SAL_DLLPRIVATE bool skipComment()
Definition: xmlreader.cxx:220
SAL_DLLPRIVATE char read()
Definition: xmlreader.hxx:127
SAL_DLLPRIVATE void skipProcessingInstruction()
Definition: xmlreader.cxx:243
SAL_DLLPRIVATE Result handleSkippedText(Span *data, int *nsId)
Definition: xmlreader.cxx:716
NamespaceList namespaces_
Definition: xmlreader.hxx:173
Span getAttributeValue(bool fullyNormalize)
Definition: xmlreader.cxx:179
SAL_DLLPRIVATE char const * handleReference(char const *position, char const *end)
Definition: xmlreader.cxx:380
NamespaceIris namespaceIris_
Definition: xmlreader.hxx:172
SAL_DLLPRIVATE void normalizeLineEnds(Span const &text)
Definition: xmlreader.cxx:196
SAL_DLLPRIVATE Span handleAttributeValue(char const *begin, char const *end, bool fullyNormalize)
Definition: xmlreader.cxx:490
SAL_DLLPRIVATE Span scanCdataSection()
Definition: xmlreader.cxx:327
SAL_DLLPRIVATE Result handleEndTag()
Definition: xmlreader.cxx:685
SAL_DLLPRIVATE bool scanName(char const **nameColon)
Definition: xmlreader.cxx:347
XmlReader(OUString fileUrl)
Definition: xmlreader.cxx:56
void * p
sal_Int64 n
#define SAL_WARN(area, stream)
def position(n=-1)
def text(shape, orig_st)
int i
enumrange< T >::Iterator begin(enumrange< T >)
end
bool is() const
Definition: span.hxx:50
size_t pos