ucb/html/regexp_8cxx_source.html

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/*

 * This file is part of the LibreOffice project.

 *

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/.

 *

 * This file incorporates work covered by the following license notice:

 *

 *   Licensed to the Apache Software Foundation (ASF) under one or more

 *   contributor license agreements. See the NOTICE file distributed

 *   with this work for additional information regarding copyright

 *   ownership. The ASF licenses this file to you under the Apache

 *   License, Version 2.0 (the "License"); you may not use this file

 *   except in compliance with the License. You may obtain a copy of

 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .

 */


#include <regexp.hxx>


#include <cstddef>


#include <osl/diagnose.h>

#include <com/sun/star/lang/IllegalArgumentException.hpp>

#include <rtl/character.hxx>

#include <rtl/ustrbuf.hxx>

#include <rtl/ustring.hxx>

#include <utility>


using namespace com::sun::star;

using namespace ucb_impl;


//  Regexp


// Construct a Regexp from its already-parsed components.
//
// eTheKind           which of the three pattern kinds this object represents
// aThePrefix         leading literal, moved into m_aPrefix
// bTheEmptyDomain    stored in m_bEmptyDomain
// aTheInfix          literal moved into m_aInfix
// bTheTranslation    stored in m_bTranslation
// aTheReversePrefix  literal moved into m_aReversePrefix
//
// The assertions below record the combinations this class expects: only a
// KIND_DOMAIN expression may have m_bEmptyDomain set or a non-empty infix,
// and only a translating expression may have a non-empty reverse prefix.
inline Regexp::Regexp(Kind eTheKind, OUString aThePrefix, bool bTheEmptyDomain,
                      OUString aTheInfix, bool bTheTranslation,
                      OUString aTheReversePrefix)
    : m_eKind(eTheKind)
    , m_aPrefix(std::move(aThePrefix))
    , m_aInfix(std::move(aTheInfix))
    , m_aReversePrefix(std::move(aTheReversePrefix))
    , m_bEmptyDomain(bTheEmptyDomain)
    , m_bTranslation(bTheTranslation)
{
    OSL_ASSERT(m_eKind == KIND_DOMAIN || (!m_bEmptyDomain && m_aInfix.isEmpty()));
    OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
}


namespace {


// Try to consume rString at the front of the range [*pBegin, pEnd), comparing
// code units with rtl::compareIgnoreAsciiCase.
//
// On success *pBegin is advanced past the matched text and true is returned.
// On failure *pBegin is left untouched and false is returned.
bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
                           sal_Unicode const * pEnd,
                           OUString const & rString)
{
    sal_Unicode const * const pText = *pBegin;
    sal_Unicode const * const pNeedle = rString.getStr();
    std::ptrdiff_t const nNeedleLen = rString.getLength();

    // Not enough input left to hold the whole needle.
    if (pEnd - pText < nNeedleLen)
        return false;

    for (std::ptrdiff_t i = 0; i != nNeedleLen; ++i)
    {
        if (rtl::compareIgnoreAsciiCase(pText[i], pNeedle[i]) != 0)
            return false;
    }

    *pBegin = pText + nNeedleLen;
    return true;
}


}


// Test whether rString is matched by this expression.
//
// The string must start with m_aPrefix (compared via matchStringIgnoreCase).
// What may follow depends on m_eKind:
//  - KIND_PREFIX:    anything.
//  - KIND_AUTHORITY: end of string, or one of '/', '?', '#'.
//  - KIND_DOMAIN:    a run of characters other than '/', '?', '#' (at least
//                    one unless m_bEmptyDomain), then m_aInfix, then end of
//                    string or one of '/', '?', '#'.
// Any other kind value yields false.
bool Regexp::matches(OUString const & rString) const
{
    sal_Unicode const * pCursor = rString.getStr();
    sal_Unicode const * const pEnd = pCursor + rString.getLength();

    // True if pPos is at the end of the input or at a '/', '?' or '#'.
    auto const atBoundary = [pEnd](sal_Unicode const * pPos)
    {
        return pPos == pEnd || *pPos == '/' || *pPos == '?' || *pPos == '#';
    };

    if (!matchStringIgnoreCase(&pCursor, pEnd, m_aPrefix))
        return false;

    if (m_eKind == KIND_PREFIX)
        return true;

    if (m_eKind == KIND_AUTHORITY)
        return atBoundary(pCursor);

    if (m_eKind != KIND_DOMAIN)
        return false;

    if (!m_bEmptyDomain)
    {
        // A non-empty domain needs at least one ordinary character here.
        if (atBoundary(pCursor))
            return false;
        ++pCursor;
    }

    // Slide over the domain part, testing at every position whether the infix
    // followed by a boundary starts there.
    for (;; ++pCursor)
    {
        sal_Unicode const * pAfterInfix = pCursor;
        if (matchStringIgnoreCase(&pAfterInfix, pEnd, m_aInfix)
            && atBoundary(pAfterInfix))
        {
            return true;
        }

        // The domain part cannot extend across a boundary, so give up.
        if (atBoundary(pCursor))
            return false;
    }
}


namespace {


// Check whether rString has the shape of an RFC 2396 <scheme>: an ASCII
// letter followed by any number of ASCII letters, digits, '+', '-' or '.'.
//
// With bColon == false the whole string must be such a scheme.
// With bColon == true the string must be such a scheme followed by exactly
// one trailing ':'.
bool isScheme(OUString const & rString, bool bColon)
{
    sal_Unicode const * pCur = rString.getStr();
    sal_Unicode const * const pLimit = pCur + rString.getLength();

    // The first character has to be a letter.
    if (pCur == pLimit || !rtl::isAsciiAlpha(*pCur))
        return false;

    for (++pCur; pCur != pLimit; ++pCur)
    {
        sal_Unicode const c = *pCur;
        bool const bSchemeChar
            = rtl::isAsciiAlphanumeric(c) || c == '+' || c == '-' || c == '.';
        if (bSchemeChar)
            continue;

        // First non-scheme character: acceptable only if a colon was asked
        // for, this is that colon, and nothing comes after it.
        return bColon && c == ':' && pCur + 1 == pLimit;
    }

    // Consumed everything without meeting a colon.
    return !bColon;
}


// Append rString to *pBuffer as a double-quoted literal, putting a backslash
// in front of every '"' and '\' character it contains.
void appendStringLiteral(OUStringBuffer * pBuffer,
                         OUString const & rString)
{
    OSL_ASSERT(pBuffer);

    sal_Unicode const * const pFirst = rString.getStr();
    sal_Unicode const * const pLast = pFirst + rString.getLength();

    pBuffer->append('"');
    for (sal_Unicode const * pCh = pFirst; pCh != pLast; ++pCh)
    {
        sal_Unicode c = *pCh;
        bool const bNeedsEscape = c == '"' || c == '\\';
        if (bNeedsEscape)
            pBuffer->append('\\');
        pBuffer->append(c);
    }
    pBuffer->append('"');
}


}


// Produce the textual form of this expression.
//
// A non-translating KIND_PREFIX expression whose prefix is "<scheme>:" is
// written as just the scheme. Otherwise the text is the quoted prefix (if
// non-empty) followed by a kind-specific pattern. For a translating
// expression that pattern is wrapped in parentheses and followed by "->",
// the quoted reverse prefix (if non-empty) and a "\1" back reference.
OUString Regexp::getRegexp() const
{
    // Abbreviated form: drop the trailing ':' of a bare scheme prefix.
    if (!m_bTranslation && m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
        return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);

    OUStringBuffer aPattern;
    if (!m_aPrefix.isEmpty())
        appendStringLiteral(&aPattern, m_aPrefix);

    // Emits what follows "[^/?#]" in both domain forms: the repetition
    // operator, the optional infix literal and the optional remainder group.
    auto const appendDomainTail = [this, &aPattern]()
    {
        aPattern.append(m_bEmptyDomain ? '*' : '+');
        if (!m_aInfix.isEmpty())
            appendStringLiteral(&aPattern, m_aInfix);
        aPattern.append("([/?#].*)?");
    };

    if (!m_bTranslation)
    {
        switch (m_eKind)
        {
            case KIND_PREFIX:
                aPattern.append(".*");
                break;

            case KIND_AUTHORITY:
                aPattern.append("([/?#].*)?");
                break;

            case KIND_DOMAIN:
                aPattern.append("[^/?#]");
                appendDomainTail();
                break;
        }
        return aPattern.makeStringAndClear();
    }

    // Translating form: capture the matched remainder in a group ...
    switch (m_eKind)
    {
        case KIND_PREFIX:
            aPattern.append("(.*)");
            break;

        case KIND_AUTHORITY:
            aPattern.append("(([/?#].*)?)");
            break;

        case KIND_DOMAIN:
            aPattern.append("([^/?#]");
            appendDomainTail();
            aPattern.append(')');
            break;
    }

    // ... and describe the replacement that refers back to it.
    aPattern.append("->");
    if (!m_aReversePrefix.isEmpty())
        appendStringLiteral(&aPattern, m_aReversePrefix);
    aPattern.append("\\1");

    return aPattern.makeStringAndClear();
}


namespace {


// Try to consume the nStringLength bytes at pString from the front of the
// range [*pBegin, pEnd). Each byte is read as an unsigned char and compared
// for exact equality with the corresponding UTF-16 code unit.
//
// On success *pBegin is advanced past the matched text and true is returned.
// On failure *pBegin is left untouched and false is returned.
bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
                 char const * pString, size_t nStringLength)
{
    sal_Unicode const * const pText = *pBegin;

    // Not enough input left to hold the whole pattern.
    if (pEnd - pText < static_cast< std::ptrdiff_t >(nStringLength))
        return false;

    for (size_t i = 0; i != nStringLength; ++i)
    {
        sal_Unicode const cExpected = static_cast< unsigned char >(pString[i]);
        if (pText[i] != cExpected)
            return false;
    }

    *pBegin = pText + nStringLength;
    return true;
}


// Try to read a double-quoted string literal from the front of the range
// [*pBegin, pEnd). Inside the quotes a backslash must be followed by '"' or
// by another backslash and stands for that character.
//
// On success the unescaped content is stored in *pString, *pBegin is moved
// past the closing quote and true is returned. If there is no opening quote,
// the literal is unterminated, or an unsupported escape is found, false is
// returned and neither *pBegin nor *pString is modified.
bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
                       OUString * pString)
{
    sal_Unicode const * pCur = *pBegin;

    if (pCur == pEnd || *pCur != '"')
        return false;
    ++pCur; // skip the opening quote

    OUStringBuffer aContent;
    while (pCur != pEnd)
    {
        sal_Unicode c = *pCur++;

        if (c == '"')
        {
            // Closing quote: publish the result.
            *pBegin = pCur;
            *pString = aContent.makeStringAndClear();
            return true;
        }

        if (c == '\\')
        {
            if (pCur == pEnd)
                return false;
            c = *pCur++;
            if (!(c == '"' || c == '\\'))
                return false;
        }

        aContent.append(c);
    }

    // Input ended before the closing quote was seen.
    return false;
}


}


// Build a Regexp from its textual form.
//
// Accepted inputs, where LIT is a string literal as read by
// scanStringLiteral() and [LIT] marks an optional one:
//
//   <scheme>                                        (same as "<scheme>:".*)
//   [LIT] .*
//   [LIT] (.*)-> [LIT] \1
//   [LIT] ([/?#].*)?
//   [LIT] (([/?#].*)?)-> LIT \1
//   [LIT] [^/?#]* [LIT] ([/?#].*)?                  ('+' allowed for '*')
//   [LIT] ([^/?#]* [LIT] ([/?#].*)?)-> LIT \1       ('+' allowed for '*')
//
// Throws lang::IllegalArgumentException if rRegexp has none of these forms.
Regexp Regexp::parse(OUString const & rRegexp)
{
    // A bare '<scheme>' (as defined in RFC 2396) abbreviates '"<scheme>:".*'.
    if (isScheme(rRegexp, false))
    {
        return Regexp(Regexp::KIND_PREFIX, rRegexp + ":", false, OUString(),
                      false, OUString());
    }

    sal_Unicode const * pCur = rRegexp.getStr();
    sal_Unicode const * const pEnd = pCur + rRegexp.getLength();

    // The leading literal is optional, so a failed scan is not an error.
    OUString aPrefix;
    scanStringLiteral(&pCur, pEnd, &aPrefix);

    // Some pattern has to follow the prefix.
    if (pCur == pEnd)
        throw lang::IllegalArgumentException();

    // The matchString() calls in this function are some of the few places
    // where RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
    // (c.f. https://gerrit.libreoffice.org/3117)

    // Plain prefix form.
    if (matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
    {
        if (pCur != pEnd)
            throw lang::IllegalArgumentException();

        return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
                      false, OUString());
    }

    // Translating prefix form; the reverse prefix literal is optional here.
    if (matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
    {
        OUString aReversePrefix;
        scanStringLiteral(&pCur, pEnd, &aReversePrefix);

        bool const bComplete
            = matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
              && pCur == pEnd;
        if (!bComplete)
            throw lang::IllegalArgumentException();

        return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
                      true, aReversePrefix);
    }

    // Plain authority form.
    if (matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
    {
        if (pCur != pEnd)
            throw lang::IllegalArgumentException();

        return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
                      false, OUString());
    }

    // Translating authority form; the reverse prefix literal is required.
    if (matchString(&pCur, pEnd,
                    RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
    {
        OUString aReversePrefix;
        bool const bComplete
            = scanStringLiteral(&pCur, pEnd, &aReversePrefix)
              && matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
              && pCur == pEnd;
        if (!bComplete)
            throw lang::IllegalArgumentException();

        return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
                      true, aReversePrefix);
    }

    // Everything else must be a domain form. A leading '(' selects the
    // translating variant.
    bool bTranslation = false;
    if (pCur != pEnd && *pCur == '(')
    {
        ++pCur;
        bTranslation = true;
    }

    if (!matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
        throw lang::IllegalArgumentException();

    // The repetition operator decides whether the domain part may be empty.
    if (pCur == pEnd)
        throw lang::IllegalArgumentException();
    sal_Unicode const cRepeat = *pCur++;
    if (cRepeat != '*' && cRepeat != '+')
        throw lang::IllegalArgumentException();
    bool const bEmptyDomain = cRepeat == '*';

    // The infix literal is optional.
    OUString aInfix;
    scanStringLiteral(&pCur, pEnd, &aInfix);

    if (!matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
        throw lang::IllegalArgumentException();

    OUString aReversePrefix;
    if (bTranslation)
    {
        // Close the group and read the mandatory replacement part.
        bool const bReplacementOk
            = matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
              && scanStringLiteral(&pCur, pEnd, &aReversePrefix)
              && matchString(&pCur, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"));
        if (!bReplacementOk)
            throw lang::IllegalArgumentException();
    }

    // Nothing may be left over.
    if (pCur != pEnd)
        throw lang::IllegalArgumentException();

    return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
                  bTranslation, aReversePrefix);
}


/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

ucb_impl::Regexp
Definition: regexp.hxx:28

ucb_impl::Regexp::Kind
Kind
Definition: regexp.hxx:31

ucb_impl::Regexp::KIND_DOMAIN
@ KIND_DOMAIN
Definition: regexp.hxx:34

ucb_impl::Regexp::KIND_PREFIX
@ KIND_PREFIX
Definition: regexp.hxx:32

ucb_impl::Regexp::KIND_AUTHORITY
@ KIND_AUTHORITY
Definition: regexp.hxx:33

ucb_impl::Regexp::m_bEmptyDomain
bool m_bEmptyDomain
Definition: regexp.hxx:55

ucb_impl::Regexp::m_aReversePrefix
OUString m_aReversePrefix
Definition: regexp.hxx:54

ucb_impl::Regexp::m_aInfix
OUString m_aInfix
Definition: regexp.hxx:53

ucb_impl::Regexp::getRegexp
OUString getRegexp() const
Definition: regexp.cxx:174

ucb_impl::Regexp::m_aPrefix
OUString m_aPrefix
Definition: regexp.hxx:52

ucb_impl::Regexp::Regexp
Regexp(Kind eTheKind, OUString aThePrefix, bool bTheEmptyDomain, OUString aTheInfix, bool bTheTranslation, OUString aTheReversePrefix)
Definition: regexp.cxx:38

ucb_impl::Regexp::parse
static Regexp parse(OUString const &rRegexp)
Definition: regexp.cxx:292

ucb_impl::Regexp::matches
bool matches(OUString const &rString) const
Definition: regexp.cxx:81

ucb_impl::Regexp::m_eKind
Kind m_eKind
Definition: regexp.hxx:51

ucb_impl::Regexp::m_bTranslation
bool m_bTranslation
Definition: regexp.hxx:56

p
void * p

com::sun::star

std

ucb_impl
Definition: regexp.hxx:25

regexp.hxx

sal_Unicode
sal_uInt16 sal_Unicode

aBuffer
std::unique_ptr< char[]> aBuffer