LibreOffice Module svl (master) 1
urihelper.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <memory>
21#include <string_view>
22
23#include <sal/config.h>
24
25#include <unicode/idna.h>
26
27#include <svl/urihelper.hxx>
28#include <com/sun/star/ucb/Command.hpp>
29#include <com/sun/star/ucb/IllegalIdentifierException.hpp>
30#include <com/sun/star/ucb/UniversalContentBroker.hpp>
31#include <com/sun/star/ucb/UnsupportedCommandException.hpp>
32#include <com/sun/star/ucb/XCommandEnvironment.hpp>
33#include <com/sun/star/ucb/XCommandProcessor.hpp>
34#include <com/sun/star/ucb/XContent.hpp>
35#include <com/sun/star/ucb/XUniversalContentBroker.hpp>
36#include <com/sun/star/uno/Any.hxx>
37#include <com/sun/star/uno/Exception.hpp>
38#include <com/sun/star/uno/Reference.hxx>
39#include <com/sun/star/uno/RuntimeException.hpp>
40#include <com/sun/star/uno/XComponentContext.hpp>
41#include <com/sun/star/uri/UriReferenceFactory.hpp>
42#include <com/sun/star/uri/XUriReference.hpp>
43#include <com/sun/star/uri/XUriReferenceFactory.hpp>
45#include <osl/diagnose.h>
46#include <rtl/character.hxx>
47#include <rtl/ustrbuf.hxx>
48#include <rtl/ustring.hxx>
49#include <sal/types.h>
50#include <sal/log.hxx>
51#include <tools/inetmime.hxx>
53
54using namespace com::sun::star;
55
56OUString URIHelper::SmartRel2Abs(INetURLObject const & rTheBaseURIRef,
57 OUString const & rTheRelURIRef,
58 Link<OUString *, bool> const & rMaybeFileHdl,
59 bool bCheckFileExists,
60 bool bIgnoreFragment,
61 INetURLObject::EncodeMechanism eEncodeMechanism,
62 INetURLObject::DecodeMechanism eDecodeMechanism,
63 rtl_TextEncoding eCharset,
64 FSysStyle eStyle)
65{
66 // Backwards compatibility:
67 if( rTheRelURIRef.startsWith("#") )
68 return rTheRelURIRef;
69
70 INetURLObject aAbsURIRef;
71 if (rTheBaseURIRef.HasError())
72 aAbsURIRef. SetSmartURL(rTheRelURIRef, eEncodeMechanism, eCharset, eStyle);
73 else
74 {
75 bool bWasAbsolute;
76 aAbsURIRef = rTheBaseURIRef.smartRel2Abs(rTheRelURIRef,
77 bWasAbsolute,
78 bIgnoreFragment,
79 eEncodeMechanism,
80 eCharset,
81 false/*bRelativeNonURIs*/,
82 eStyle);
83 if (bCheckFileExists
84 && !bWasAbsolute
85 && (aAbsURIRef.GetProtocol() == INetProtocol::File))
86 {
87 INetURLObject aNonFileURIRef;
88 aNonFileURIRef.SetSmartURL(rTheRelURIRef,
89 eEncodeMechanism,
90 eCharset,
91 eStyle);
92 if (!aNonFileURIRef.HasError()
93 && aNonFileURIRef.GetProtocol() != INetProtocol::File)
94 {
95 bool bMaybeFile = false;
96 if (rMaybeFileHdl.IsSet())
97 {
98 OUString aFilePath(rTheRelURIRef);
99 bMaybeFile = rMaybeFileHdl.Call(&aFilePath);
100 }
101 if (!bMaybeFile)
102 aAbsURIRef = aNonFileURIRef;
103 }
104 }
105 }
106 return aAbsURIRef.GetMainURL(eDecodeMechanism, eCharset);
107}
108
109namespace { Link<OUString *, bool> gMaybeFileHdl; }
110
112{
113 gMaybeFileHdl = rTheMaybeFileHdl;
114}
115
117{
118 return gMaybeFileHdl;
119}
120
121namespace {
122
123bool isAbsoluteHierarchicalUriReference(
124 css::uno::Reference< css::uri::XUriReference > const & uriReference)
125{
126 return uriReference.is() && uriReference->isAbsolute()
127 && !uriReference->hasRelativePath();
128}
129
// Outcome of normalizePrefix.
//
// Performance shortcut: if, for some prefix URL of a hierarchical URL, no UCB
// content can be created or the content does not support the
// getCasePreservingURL command, the same is assumed to hold for every other
// prefix URL of that URL as well.  Such cases are reported as GeneralFailure
// so that callers can stop trying.
enum Result
{
    Success,        // the command ran; the normalized URL was written
    GeneralFailure, // no content, or the command is unsupported
    SpecificFailure // the command failed for this particular URL only
};
135
136Result normalizePrefix( css::uno::Reference< css::ucb::XUniversalContentBroker > const & broker,
137 OUString const & uri, OUString * normalized)
138{
139 OSL_ASSERT(broker.is() && normalized != nullptr);
140 css::uno::Reference< css::ucb::XContent > content;
141 try {
142 content = broker->queryContent(broker->createContentIdentifier(uri));
143 } catch (css::ucb::IllegalIdentifierException &) {}
144 if (!content.is()) {
145 return GeneralFailure;
146 }
147 try {
148 bool ok =
149 (css::uno::Reference< css::ucb::XCommandProcessor >(
150 content, css::uno::UNO_QUERY_THROW)->execute(
151 css::ucb::Command("getCasePreservingURL",
152 -1, css::uno::Any()),
153 0,
154 css::uno::Reference< css::ucb::XCommandEnvironment >())
155 >>= *normalized);
156 OSL_ASSERT(ok);
157 } catch (css::uno::RuntimeException &) {
158 throw;
159 } catch (css::ucb::UnsupportedCommandException &) {
160 return GeneralFailure;
161 } catch (css::uno::Exception &) {
162 return SpecificFailure;
163 }
164 return Success;
165}
166
167OUString normalize(
168 css::uno::Reference< css::ucb::XUniversalContentBroker > const & broker,
169 css::uno::Reference< css::uri::XUriReferenceFactory > const & uriFactory,
170 OUString const & uriReference)
171{
172 // normalizePrefix can potentially fail (a typically example being a file
173 // URL that denotes a non-existing resource); in such a case, try to
174 // normalize as long a prefix of the given URL as possible (i.e., normalize
175 // all the existing directories within the path):
176 OUString normalized;
177 sal_Int32 n = uriReference.indexOf('#');
178 normalized = n == -1 ? uriReference : uriReference.copy(0, n);
179 switch (normalizePrefix(broker, normalized, &normalized)) {
180 case Success:
181 return n == -1 ? normalized : normalized + uriReference.subView(n);
182 case GeneralFailure:
183 return uriReference;
184 case SpecificFailure:
185 default:
186 break;
187 }
188 css::uno::Reference< css::uri::XUriReference > ref(
189 uriFactory->parse(uriReference));
190 if (!isAbsoluteHierarchicalUriReference(ref)) {
191 return uriReference;
192 }
193 sal_Int32 count = ref->getPathSegmentCount();
194 if (count < 2) {
195 return uriReference;
196 }
197 OUStringBuffer head(ref->getScheme());
198 head.append(':');
199 if (ref->hasAuthority()) {
200 head.append("//" + ref->getAuthority());
201 }
202 for (sal_Int32 i = count - 1; i > 0; --i) {
203 OUStringBuffer buf(head);
204 for (sal_Int32 j = 0; j < i; ++j) {
205 buf.append('/');
206 buf.append(ref->getPathSegment(j));
207 }
208 normalized = buf.makeStringAndClear();
209 if (normalizePrefix(broker, normalized, &normalized) != SpecificFailure)
210 {
211 buf.append(normalized);
212 css::uno::Reference< css::uri::XUriReference > preRef(
213 uriFactory->parse(normalized));
214 if (!isAbsoluteHierarchicalUriReference(preRef)) {
215 // This could only happen if something is inconsistent:
216 break;
217 }
218 sal_Int32 preCount = preRef->getPathSegmentCount();
219 // normalizePrefix may have added or removed a final slash:
220 if (preCount != i) {
221 if (preCount == i - 1) {
222 buf.append('/');
223 } else if (preCount - 1 == i && !buf.isEmpty()
224 && buf[buf.getLength() - 1] == '/')
225 {
226 buf.setLength(buf.getLength() - 1);
227 } else {
228 // This could only happen if something is inconsistent:
229 break;
230 }
231 }
232 for (sal_Int32 j = i; j < count; ++j) {
233 buf.append('/');
234 buf.append(ref->getPathSegment(j));
235 }
236 if (ref->hasQuery()) {
237 buf.append('?');
238 buf.append(ref->getQuery());
239 }
240 if (ref->hasFragment()) {
241 buf.append('#');
242 buf.append(ref->getFragment());
243 }
244 return buf.makeStringAndClear();
245 }
246 }
247 return uriReference;
248}
249
250}
251
252css::uno::Reference< css::uri::XUriReference >
254 css::uno::Reference< css::uno::XComponentContext > const & context,
255 OUString const & baseUriReference, OUString const & uriReference)
256{
257 OSL_ASSERT(context.is());
258 css::uno::Reference< css::ucb::XUniversalContentBroker > broker(
259 css::ucb::UniversalContentBroker::create(context));
260 css::uno::Reference< css::uri::XUriReferenceFactory > uriFactory(
261 css::uri::UriReferenceFactory::create(context));
262 return uriFactory->makeRelative(
263 uriFactory->parse(normalize(broker, uriFactory, baseUriReference)),
264 uriFactory->parse(normalize(broker, uriFactory, uriReference)), true,
265 true, false);
266}
267
269 OUString const & baseUriReference, OUString const & uriReference)
270{
271 css::uno::Reference< css::uri::XUriReference > rel(
273 comphelper::getProcessComponentContext(), baseUriReference,
274 uriReference));
275 return rel.is() ? rel->getUriReference() : uriReference;
276}
277
278
279// FindFirstURLInText
280
281
282namespace {
283
284sal_Int32 nextChar(std::u16string_view rStr, sal_Int32 nPos)
285{
286 return rtl::isHighSurrogate(rStr[nPos])
287 && rStr.size() - nPos >= 2
288 && rtl::isLowSurrogate(rStr[nPos + 1]) ?
289 nPos + 2 : nPos + 1;
290}
291
292bool isBoundary1(CharClass const & rCharClass, OUString const & rStr,
293 sal_Int32 nPos, sal_Int32 nEnd)
294{
295 if (nPos == nEnd)
296 return true;
297 if (rCharClass.isLetterNumeric(rStr, nPos))
298 return false;
299 switch (rStr[nPos])
300 {
301 case '$':
302 case '%':
303 case '&':
304 case '-':
305 case '/':
306 case '@':
307 case '\\':
308 return false;
309 default:
310 return true;
311 }
312}
313
314bool isBoundary2(CharClass const & rCharClass, OUString const & rStr,
315 sal_Int32 nPos, sal_Int32 nEnd)
316{
317 if (nPos == nEnd)
318 return true;
319 if (rCharClass.isLetterNumeric(rStr, nPos))
320 return false;
321 switch (rStr[nPos])
322 {
323 case '!':
324 case '#':
325 case '$':
326 case '%':
327 case '&':
328 case '\'':
329 case '*':
330 case '+':
331 case '-':
332 case '/':
333 case '=':
334 case '?':
335 case '@':
336 case '^':
337 case '_':
338 case '`':
339 case '{':
340 case '|':
341 case '}':
342 case '~':
343 return false;
344 default:
345 return true;
346 }
347}
348
349// tdf#145381 Added MatchingBracketDepth counter to detect matching closing
350// brackets that are part of the uri
351bool checkWChar(CharClass const & rCharClass, OUString const & rStr,
352 sal_Int32 * pPos, sal_Int32 * pEnd,
353 sal_Int32 * pMatchingBracketDepth = nullptr,
354 bool bBackslash = false, bool bPipe = false)
355{
356 sal_Unicode c = rStr[*pPos];
357 if (rtl::isAscii(c))
358 {
359 static sal_uInt8 const aMap[128]
360 = { 0, 0, 0, 0, 0, 0, 0, 0,
361 0, 0, 0, 0, 0, 0, 0, 0,
362 0, 0, 0, 0, 0, 0, 0, 0,
363 0, 0, 0, 0, 0, 0, 0, 0,
364 0, 1, 0, 0, 4, 4, 4, 1, // !"#$%&'
365 5, 6, 1, 1, 1, 4, 1, 4, // ()*+,-./
366 4, 4, 4, 4, 4, 4, 4, 4, // 01234567
367 4, 4, 1, 1, 0, 1, 0, 1, // 89:;<=>?
368 4, 4, 4, 4, 4, 4, 4, 4, // @ABCDEFG
369 4, 4, 4, 4, 4, 4, 4, 4, // HIJKLMNO
370 4, 4, 4, 4, 4, 4, 4, 4, // PQRSTUVW
371 4, 4, 4, 1, 2, 1, 0, 1, // XYZ[\]^_
372 0, 4, 4, 4, 4, 4, 4, 4, // `abcdefg
373 4, 4, 4, 4, 4, 4, 4, 4, // hijklmno
374 4, 4, 4, 4, 4, 4, 4, 4, // pqrstuvw
375 4, 4, 4, 0, 3, 0, 1, 0 }; // xyz{|}~
376 switch (aMap[c])
377 {
378 default: // not uric
379 return false;
380
381 case 1: // uric
382 ++(*pPos);
383 return true;
384
385 case 2: // "\"
386 if (bBackslash)
387 {
388 *pEnd = ++(*pPos);
389 return true;
390 }
391 else
392 return false;
393
394 case 3: // "|"
395 if (bPipe)
396 {
397 *pEnd = ++(*pPos);
398 return true;
399 }
400 else
401 return false;
402
403 case 4: // alpha, digit, "$", "%", "&", "-", "/", "@" (see
404 // isBoundary1)
405 *pEnd = ++(*pPos);
406 return true;
407
408 case 5: // opening bracket
409 ++(*pPos);
410 if(nullptr != pMatchingBracketDepth)
411 ++(*pMatchingBracketDepth);
412 return true;
413
414 case 6: // closing bracket
415 ++(*pPos);
416 if(nullptr != pMatchingBracketDepth && *pMatchingBracketDepth > 0)
417 {
418 --(*pMatchingBracketDepth);
419 // tdf#145381 When there was an opening bracket, detect this closing bracket
420 // as part of the uri
421 *pEnd = *pPos;
422 }
423 return true;
424
425 }
426 }
427 else if (rCharClass.isLetterNumeric(rStr, *pPos))
428 {
429 *pEnd = *pPos = nextChar(rStr, *pPos);
430 return true;
431 }
432 else
433 return false;
434}
435
436sal_uInt32 scanDomain(OUString const & rStr, sal_Int32 * pPos,
437 sal_Int32 nEnd)
438{
439 sal_Unicode const * pBuffer = rStr.getStr();
440 sal_Unicode const * p = pBuffer + *pPos;
441 sal_uInt32 nLabels = INetURLObject::scanDomain(p, pBuffer + nEnd, false);
442 *pPos = sal::static_int_cast< sal_Int32 >(p - pBuffer);
443 return nLabels;
444}
445
446}
447
448OUString URIHelper::FindFirstURLInText(OUString const & rText,
449 sal_Int32 & rBegin,
450 sal_Int32 & rEnd,
451 CharClass const & rCharClass,
453 rtl_TextEncoding eCharset)
454{
455 if (rBegin > rEnd || rEnd > rText.getLength())
456 return OUString();
457
458 // Search for the first substring of [rBegin..rEnd[ that matches any of the
459 // following productions (for which the appropriate style bit is set in
460 // eStyle, if applicable).
461
462 // 1st Production (known scheme):
463 // \B1 <one of the known schemes, except file> ":" 1*wchar ["#" 1*wchar]
464 // \B1
465
466 // 2nd Production (file):
467 // \B1 "FILE:" 1*(wchar / "\" / "|") ["#" 1*wchar] \B1
468
469 // 3rd Production (ftp):
470 // \B1 "FTP" 2*("." label) ["/" *wchar] ["#" 1*wchar] \B1
471
472 // 4th Production (http):
473 // \B1 "WWW" 2*("." label) ["/" *wchar] ["#" 1*wchar] \B1
474
475 // 5th Production (mailto):
476 // \B2 local-part "@" domain \B1
477
478 // 6th Production (UNC file):
479 // \B1 "\\" domain "\" *(wchar / "\") \B1
480
481 // 7th Production (DOS file):
482 // \B1 ALPHA ":\" *(wchar / "\") \B1
483
484 // 8th Production (Unix-like DOS file):
485 // \B1 ALPHA ":/" *(wchar / "\") \B1
486
487 // The productions use the following auxiliary rules.
488
489 // local-part = atom *("." atom)
490 // atom = 1*(alphanum / "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+"
491 // / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}"
492 // / "~")
493 // domain = label *("." label)
494 // label = alphanum [*(alphanum / "-") alphanum]
495 // alphanum = ALPHA / DIGIT
496 // wchar = <any uric character (ignoring the escaped rule), or "%", or
497 // a letter or digit (according to rCharClass)>
498
499 // "\B1" (boundary 1) stands for the beginning or end of the block of text,
500 // or a character that is neither (a) a letter or digit (according to
501 // rCharClass), nor (b) any of "$", "%", "&", "-", "/", "@", or "\".
502 // (FIXME: What was the rationale for this set of punctuation characters?)
503
504 // "\B2" (boundary 2) stands for the beginning or end of the block of text,
505 // or a character that is neither (a) a letter or digit (according to
506 // rCharClass), nor (b) any of "!", "#", "$", "%", "&", "'", "*", "+", "-",
507 // "/", "=", "?", "@", "^", "_", "`", "{", "|", "}", or "~" (i.e., an RFC
508 // 822 <atom> character, or "@" from \B1's set above).
509
510 // Productions 1--4, and 6--8 try to find a maximum-length match, but they
511 // stop at the first <wchar> character that is a "\B1" character which is
512 // only followed by "\B1" characters (taking "\" and "|" characters into
513 // account appropriately). Production 5 simply tries to find a maximum-
514 // length match.
515
516 // Productions 1--4 use the given eMechanism and eCharset. Productions 5--9
517 // use EncodeMechanism::All.
518
519 // Productions 6--9 are only applicable if the FSysStyle::Dos bit is set in
520 // eStyle.
521
522 // tdf#145381: In addition to the productions I added a mechanism to detect
523 // matching brackets. The task presents the case of an url that ends on a
524 // closing bracket. This needs to be detected as part of the uri in the case
525 // that a matching opening bracket exists.
526
527 bool bBoundary1 = true;
528 bool bBoundary2 = true;
529 for (sal_Int32 nPos = rBegin; nPos != rEnd; nPos = nextChar(rText, nPos))
530 {
531 sal_Unicode c = rText[nPos];
532 if (bBoundary1)
533 {
534 if (rtl::isAsciiAlpha(c))
535 {
536 sal_Int32 i = nPos;
537 INetProtocol eScheme = INetURLObject::CompareProtocolScheme(rText.subView(i, rEnd - i));
538 if (eScheme == INetProtocol::File) // 2nd
539 {
540 while (rText[i++] != ':') ;
541 sal_Int32 nPrefixEnd = i;
542 sal_Int32 nUriEnd = i;
543 while (i != rEnd
544 && checkWChar(rCharClass, rText, &i, &nUriEnd, nullptr, true,
545 true)) ;
546 if (i != nPrefixEnd && i != rEnd && rText[i] == '#')
547 {
548 ++i;
549 while (i != rEnd
550 && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
551 }
552 if (nUriEnd != nPrefixEnd
553 && isBoundary1(rCharClass, rText, nUriEnd, rEnd))
554 {
555 INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
556 INetProtocol::File, eMechanism, eCharset,
557 FSysStyle::Detect);
558 if (!aUri.HasError())
559 {
560 rBegin = nPos;
561 rEnd = nUriEnd;
562 return
564 }
565 }
566 }
567 else if (eScheme != INetProtocol::NotValid) // 1st
568 {
569 while (rText[i++] != ':') ;
570 sal_Int32 nPrefixEnd = i;
571 sal_Int32 nUriEnd = i;
572 sal_Int32 nMatchingBracketDepth = 0;
573 while (i != rEnd
574 && checkWChar(rCharClass, rText, &i, &nUriEnd,
575 &nMatchingBracketDepth)) ;
576 if (i != nPrefixEnd && i != rEnd && rText[i] == '#')
577 {
578 ++i;
579 while (i != rEnd
580 && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
581 }
582 if (nUriEnd != nPrefixEnd
583 && (isBoundary1(rCharClass, rText, nUriEnd, rEnd)
584 || rText[nUriEnd] == '\\'))
585 {
586 INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
587 INetProtocol::Http, eMechanism,
588 eCharset);
589 if (!aUri.HasError())
590 {
591 rBegin = nPos;
592 rEnd = nUriEnd;
593 return
595 }
596 }
597 }
598
599 // 3rd, 4th:
600 i = nPos;
601 sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
602 if (nLabels >= 3
603 && rText[nPos + 3] == '.'
604 && (((rText[nPos] == 'w'
605 || rText[nPos] == 'W')
606 && (rText[nPos + 1] == 'w'
607 || rText[nPos + 1] == 'W')
608 && (rText[nPos + 2] == 'w'
609 || rText[nPos + 2] == 'W'))
610 || ((rText[nPos] == 'f'
611 || rText[nPos] == 'F')
612 && (rText[nPos + 1] == 't'
613 || rText[nPos + 1] == 'T')
614 && (rText[nPos + 2] == 'p'
615 || rText[nPos + 2] == 'P'))))
616 // (note that rText.GetChar(nPos + 3) is guaranteed to be
617 // valid)
618 {
619 sal_Int32 nUriEnd = i;
620 if (i != rEnd && rText[i] == '/')
621 {
622 nUriEnd = ++i;
623 while (i != rEnd
624 && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
625 }
626 if (i != rEnd && rText[i] == '#')
627 {
628 ++i;
629 while (i != rEnd
630 && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
631 }
632 if (isBoundary1(rCharClass, rText, nUriEnd, rEnd)
633 || rText[nUriEnd] == '\\')
634 {
635 INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
636 INetProtocol::Http, eMechanism,
637 eCharset);
638 if (!aUri.HasError())
639 {
640 rBegin = nPos;
641 rEnd = nUriEnd;
642 return
644 }
645 }
646 }
647
648 if (rEnd - nPos >= 3
649 && rText[nPos + 1] == ':'
650 && (rText[nPos + 2] == '/'
651 || rText[nPos + 2] == '\\')) // 7th, 8th
652 {
653 i = nPos + 3;
654 sal_Int32 nUriEnd = i;
655 while (i != rEnd
656 && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
657 if (isBoundary1(rCharClass, rText, nUriEnd, rEnd))
658 {
659 INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
660 INetProtocol::File,
662 RTL_TEXTENCODING_UTF8,
663 FSysStyle::Dos);
664 if (!aUri.HasError())
665 {
666 rBegin = nPos;
667 rEnd = nUriEnd;
668 return
670 }
671 }
672 }
673 }
674 else if (rEnd - nPos >= 2
675 && rText[nPos] == '\\'
676 && rText[nPos + 1] == '\\') // 6th
677 {
678 sal_Int32 i = nPos + 2;
679 sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
680 if (nLabels >= 1 && i != rEnd && rText[i] == '\\')
681 {
682 sal_Int32 nUriEnd = ++i;
683 while (i != rEnd
684 && checkWChar(rCharClass, rText, &i, &nUriEnd,
685 nullptr, true)) ;
686 if (isBoundary1(rCharClass, rText, nUriEnd, rEnd))
687 {
688 INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
689 INetProtocol::File,
691 RTL_TEXTENCODING_UTF8,
692 FSysStyle::Dos);
693 if (!aUri.HasError())
694 {
695 rBegin = nPos;
696 rEnd = nUriEnd;
697 return
699 }
700 }
701 }
702 }
703 }
704 if (bBoundary2 && INetMIME::isAtomChar(c)) // 5th
705 {
706 bool bDot = false;
707 for (sal_Int32 i = nPos + 1; i != rEnd; ++i)
708 {
709 sal_Unicode c2 = rText[i];
710 if (INetMIME::isAtomChar(c2))
711 bDot = false;
712 else if (bDot)
713 break;
714 else if (c2 == '.')
715 bDot = true;
716 else
717 {
718 if (c2 == '@')
719 {
720 ++i;
721 sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
722 if (nLabels >= 1
723 && isBoundary1(rCharClass, rText, i, rEnd))
724 {
725 INetURLObject aUri(rText.subView(nPos, i - nPos),
726 INetProtocol::Mailto,
728 if (!aUri.HasError())
729 {
730 rBegin = nPos;
731 rEnd = i;
732 return aUri.GetMainURL(
734 }
735 }
736 }
737 break;
738 }
739 }
740 }
741 bBoundary1 = isBoundary1(rCharClass, rText, nPos, rEnd);
742 bBoundary2 = isBoundary2(rCharClass, rText, nPos, rEnd);
743 }
744 rBegin = rEnd;
745 return OUString();
746}
747
748OUString URIHelper::FindFirstDOIInText(OUString const & rText,
749 sal_Int32 & rBegin,
750 sal_Int32 & rEnd,
751 CharClass const & rCharClass)
752{
753 if (rBegin > rEnd || rEnd > rText.getLength())
754 return OUString();
755
756 sal_Int32 start = 7;
757 sal_Int32 count = rEnd-rBegin;
758 OUString candidate(rText.subView(rBegin, count));
759 // Match with regex "doi:10\.\d{4,9}\/[-._;()\/:a-zA-Z0-9]+"
760 if (candidate.startsWithIgnoreAsciiCase("doi:10."))
761 {
762 bool flag = true;
763 sal_Int32 digit = 0;
764 for (sal_Int32 i=start; i<count; i++)
765 {
766 sal_Unicode c = candidate[i];
767 // Match 4 to 9 digits before slash
768 if (digit >= 0)
769 {
770 if (digit>9)
771 {
772 flag = false;
773 break;
774 }
775
776 if ( rCharClass.isDigit(candidate,i) )
777 {
778 digit++;
779 }
780 else if (c=='/' && digit>=4 && i<count-1)
781 {
782 digit=-1;
783 }
784 else
785 {
786 flag = false;
787 break;
788 }
789 }
790 // Match [-._;()\/:a-zA-Z0-9] after slash
791 else if (!( rCharClass.isAlphaNumeric(candidate, i) || c == '.' || c == '-' || c=='_' ||
792 c==';' || c=='(' || c==')' || c=='\\' || (c=='/' && i<count-1) || c==':'))
793 {
794 flag = false;
795 break;
796 }
797 }
798 if (flag && digit==-1)
799 {
800 return OUString::Concat("https://doi.org/")+candidate.subView(4);
801 }
802 }
803 rBegin = rEnd;
804 return OUString();
805}
806
807OUString URIHelper::removePassword(OUString const & rURI,
808 INetURLObject::EncodeMechanism eEncodeMechanism,
809 INetURLObject::DecodeMechanism eDecodeMechanism,
810 rtl_TextEncoding eCharset)
811{
812 INetURLObject aObj(rURI, eEncodeMechanism, eCharset);
813 return aObj.HasError() ?
814 rURI :
815 aObj.GetURLNoPass(eDecodeMechanism, eCharset);
816}
817
818OUString URIHelper::resolveIdnaHost(OUString const & url) {
819 css::uno::Reference<css::uri::XUriReference> uri(
820 css::uri::UriReferenceFactory::create(
822 ->parse(url));
823 if (!(uri.is() && uri->hasAuthority())) {
824 return url;
825 }
826 auto auth(uri->getAuthority());
827 if (auth.isEmpty())
828 return url;
829 sal_Int32 hostStart = auth.indexOf('@') + 1;
830 sal_Int32 hostEnd = auth.getLength();
831 while (hostEnd > hostStart && rtl::isAsciiDigit(auth[hostEnd - 1])) {
832 --hostEnd;
833 }
834 if (hostEnd > hostStart && auth[hostEnd - 1] == ':') {
835 --hostEnd;
836 } else {
837 hostEnd = auth.getLength();
838 }
839 auto asciiOnly = true;
840 for (auto i = hostStart; i != hostEnd; ++i) {
841 if (!rtl::isAscii(auth[i])) {
842 asciiOnly = false;
843 break;
844 }
845 }
846 if (asciiOnly) {
847 // Avoid icu::IDNA case normalization in purely non-IDNA domain names:
848 return url;
849 }
850 UErrorCode e = U_ZERO_ERROR;
851 std::unique_ptr<icu::IDNA> idna(
852 icu::IDNA::createUTS46Instance(
853 (UIDNA_USE_STD3_RULES | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_CHECK_CONTEXTO),
854 e));
855 if (U_FAILURE(e)) {
856 SAL_WARN("vcl.gdi", "icu::IDNA::createUTS46Instance " << e);
857 return url;
858 }
859 icu::UnicodeString ascii;
860 icu::IDNAInfo info;
861 idna->nameToASCII(
862 icu::UnicodeString(
863 reinterpret_cast<UChar const *>(auth.getStr() + hostStart),
864 hostEnd - hostStart),
865 ascii, info, e);
866 if (U_FAILURE(e) || info.hasErrors()) {
867 return url;
868 }
869 OUStringBuffer buf(uri->getScheme());
870 buf.append(OUString::Concat("://") + auth.subView(0, hostStart));
871 buf.append(
872 reinterpret_cast<sal_Unicode const *>(ascii.getBuffer()),
873 ascii.length());
874 buf.append(auth.subView(hostEnd) + uri->getPath());
875 if (uri->hasQuery()) {
876 buf.append("?" + uri->getQuery());
877 }
878 if (uri->hasFragment()) {
879 buf.append("#" + uri->getFragment());
880 }
881 return buf.makeStringAndClear();
882}
883
884/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
bool isAlphaNumeric(const OUString &rStr, sal_Int32 nPos) const
bool isLetterNumeric(const OUString &rStr, sal_Int32 nPos) const
bool isDigit(const OUString &rStr, sal_Int32 nPos) const
static bool isAtomChar(sal_uInt32 nChar)
OUString GetMainURL(DecodeMechanism eMechanism, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8) const
INetURLObject smartRel2Abs(OUString const &rTheRelURIRef, bool &rWasAbsolute, bool bIgnoreFragment=false, EncodeMechanism eMechanism=EncodeMechanism::WasEncoded, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8, bool bRelativeNonURIs=false, FSysStyle eStyle=FSysStyle::Detect) const
bool HasError() const
bool SetSmartURL(std::u16string_view rTheAbsURIRef, EncodeMechanism eMechanism=EncodeMechanism::WasEncoded, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8, FSysStyle eStyle=FSysStyle::Detect)
static INetProtocol CompareProtocolScheme(std::u16string_view aTheAbsURIRef)
static sal_uInt32 scanDomain(sal_Unicode const *&rBegin, sal_Unicode const *pEnd, bool bEager=true)
INetProtocol GetProtocol() const
OUString GetURLNoPass(DecodeMechanism eMechanism=DecodeMechanism::ToIUri, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8) const
void * p
sal_Int64 n
sal_uInt16 nPos
#define SAL_WARN(area, stream)
SVL_DLLPUBLIC OUString FindFirstURLInText(OUString const &rText, sal_Int32 &rBegin, sal_Int32 &rEnd, CharClass const &rCharClass, INetURLObject::EncodeMechanism eMechanism=INetURLObject::EncodeMechanism::WasEncoded, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8)
Definition: urihelper.cxx:448
SVL_DLLPUBLIC OUString FindFirstDOIInText(OUString const &rText, sal_Int32 &rBegin, sal_Int32 &rEnd, CharClass const &rCharClass)
Definition: urihelper.cxx:748
SVL_DLLPUBLIC Link< OUString *, bool > const & GetMaybeFileHdl()
Definition: urihelper.cxx:116
SVL_DLLPUBLIC OUString SmartRel2Abs(INetURLObject const &rTheBaseURIRef, OUString const &rTheRelURIRef, Link< OUString *, bool > const &rMaybeFileHdl=Link< OUString *, bool >(), bool bCheckFileExists=true, bool bIgnoreFragment=false, INetURLObject::EncodeMechanism eEncodeMechanism=INetURLObject::EncodeMechanism::WasEncoded, INetURLObject::DecodeMechanism eDecodeMechanism=INetURLObject::DecodeMechanism::ToIUri, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8, FSysStyle eStyle=FSysStyle::Detect)
@ATT Calling this function with defaulted arguments rMaybeFileHdl = Link() and bCheckFileExists = tru...
Definition: urihelper.cxx:56
SVL_DLLPUBLIC OUString removePassword(OUString const &rURI, INetURLObject::EncodeMechanism eEncodeMechanism, INetURLObject::DecodeMechanism eDecodeMechanism=INetURLObject::DecodeMechanism::ToIUri, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8)
Remove any password component from both absolute and relative URLs.
Definition: urihelper.cxx:807
SVL_DLLPUBLIC css::uno::Reference< css::uri::XUriReference > normalizedMakeRelative(css::uno::Reference< css::uno::XComponentContext > const &context, OUString const &baseUriReference, OUString const &uriReference)
Converts a URI reference to a relative one, ignoring certain differences (for example,...
Definition: urihelper.cxx:253
SVL_DLLPUBLIC OUString simpleNormalizedMakeRelative(OUString const &baseUriReference, OUString const &uriReference)
A variant of normalizedMakeRelative with a simplified interface.
Definition: urihelper.cxx:268
SVL_DLLPUBLIC void SetMaybeFileHdl(Link< OUString *, bool > const &rTheMaybeFileHdl)
Definition: urihelper.cxx:111
SVL_DLLPUBLIC OUString resolveIdnaHost(OUString const &url)
Resolve a URL's host component domain name in IDNA syntax to plain DNS syntax.
Definition: urihelper.cxx:818
bool normalize(sal_uInt16 &rDay, sal_uInt16 &rMonth, sal_Int16 &rYear)
Reference< XComponentContext > getProcessComponentContext()
int i
bool parse(OUString const &uri, SourceProviderScannerData *data)
HashMap_OWString_Interface aMap
unsigned char sal_uInt8
sal_uInt16 sal_Unicode
INetProtocol
FSysStyle