LibreOffice Module vcl (master)  1
scrptrun.cxx
Go to the documentation of this file.
1 /*
2  *******************************************************************************
3  *
4  * Copyright (c) 1995-2013 International Business Machines Corporation and others
5  *
6  * All rights reserved.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy of
9  * this software and associated documentation files (the "Software"), to deal in
10  * the Software without restriction, including without limitation the rights to
11  * use, copy, modify, merge, publish, distribute, and/or sell copies of the
12  * Software, and to permit persons to whom the Software is furnished to do so,
13  * provided that the above copyright notice(s) and this permission notice appear
14  * in all copies of the Software and that both the above copyright notice(s) and
15  * this permission notice appear in supporting documentation.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
20  * NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
21  * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY
22  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
23  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
24  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25  *
26  * Except as contained in this notice, the name of a copyright holder shall not be
27  * used in advertising or otherwise to promote the sale, use or other dealings in
28  * this Software without prior written authorization of the copyright holder.
29  *
30  *******************************************************************************
31  * file name: scrptrun.cpp
32  *
33  * created on: 10/17/2001
34  * created by: Eric R. Mader
35  */
41 #include <sal/config.h>
42 
43 #include <rtl/character.hxx>
44 #include <unicode/utypes.h>
45 #include <unicode/uscript.h>
46 
47 #include <scrptrun.h>
48 #include <algorithm>
49 
50 namespace {
51 
// Lookup tables that map paired punctuation characters to a "pair index".
//
// Each pair occupies two consecutive indices: the opening character has the
// even index and its closing counterpart the following odd index, so the
// opener's index is the closer's index with the low bit cleared. Characters
// that do not belong to any pair map to -1.
//
// Three small tables cover the code point ranges that contain the supported
// characters, which keeps a lookup down to a range check and one array access:
//   ma00 - U+0000 .. U+00FE
//   ma20 - U+2000 .. U+207E
//   ma30 - U+3000 .. U+307E
struct PairIndices
{
    int8_t ma00[0xff];
    int8_t ma20[0x7f];
    int8_t ma30[0x7f];

    // Marks every slot as "not a paired character" and then registers the
    // known pairs.
    PairIndices()
    {
        std::fill_n(ma00, 0xff, -1);
        std::fill_n(ma20, 0x7f, -1);
        std::fill_n(ma30, 0x7f, -1);

        // characters in the range 0x0000 - 0x00fe (inclusive)
        // ascii paired punctuation
        ma00[0x28] = 0;     // (
        ma00[0x29] = 1;     // )
        ma00[0x3c] = 2;     // <
        ma00[0x3e] = 3;     // >
        ma00[0x5b] = 4;     // [
        ma00[0x5d] = 5;     // ]
        ma00[0x7b] = 6;     // {
        ma00[0x7d] = 7;     // }
        // guillemets
        ma00[0xab] = 8;
        ma00[0xbb] = 9;

        // characters in the range 0x2000 - 0x207e (inclusive)
        // general punctuation
        ma20[0x18] = 10;
        ma20[0x19] = 11;
        ma20[0x1c] = 12;
        ma20[0x1d] = 13;
        ma20[0x39] = 14;
        ma20[0x3a] = 15;

        // characters in the range 0x3000 - 0x307e (inclusive)
        // chinese paired punctuation
        ma30[0x08] = 16;
        ma30[0x09] = 17;
        ma30[0x0a] = 18;
        ma30[0x0b] = 19;
        ma30[0x0c] = 20;
        ma30[0x0d] = 21;
        ma30[0x0e] = 22;
        ma30[0x0f] = 23;
        ma30[0x10] = 24;
        ma30[0x11] = 25;
        ma30[0x14] = 26;
        ma30[0x15] = 27;
        ma30[0x16] = 28;
        ma30[0x17] = 29;
        ma30[0x18] = 30;
        ma30[0x19] = 31;
        ma30[0x1a] = 32;
        ma30[0x1b] = 33;
    }

    // Returns the pair index registered for ch, or -1 when ch is not a paired
    // punctuation character (including any value outside the three tables).
    int32_t getPairIndex(UChar32 ch) const
    {
        // UChar32 is a signed type: a negative value must never be used as
        // an index into ma00.
        if (ch < 0)
            return -1;
        if (ch < 0xff)
            return ma00[ch];
        if (ch >= 0x2000 && ch < 0x207f)
            return ma20[ch - 0x2000];
        if (ch >= 0x3000 && ch < 0x307f)
            return ma30[ch - 0x3000];
        return -1;
    }

};
121 
122 // There are three Unicode script codes for Japanese text, but only one
123 // OpenType script tag, so we want to keep them in one run as splitting is
124 // pointless for the purpose of OpenType shaping.
125 UScriptCode getScript(UChar32 ch, UErrorCode* status)
126 {
127  UScriptCode script = uscript_getScript(ch, status);
128  if (U_FAILURE(*status))
129  return script;
130  if (script == USCRIPT_KATAKANA || script == USCRIPT_KATAKANA_OR_HIRAGANA)
131  return USCRIPT_HIRAGANA;
132  return script;
133 }
134 
135 }
136 
// The single PairIndices instance; next() queries it through
// gPairIndices.getPairIndex(ch). Its constructor fills the lookup tables.
137 const PairIndices gPairIndices;
138 
139 
140 namespace vcl {
141 
// Definition of the static class member declared in scrptrun.h. According to
// that declaration's documentation, it is the address of this variable that
// serves as the class ID for ICU "poor man's RTTI".
142 const char ScriptRun::fgClassID=0;
143 
144 static bool sameScript(int32_t scriptOne, int32_t scriptTwo)
145 {
146  return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
147 }
148 
// Body of the function that the cross-reference list at the end of this
// listing names "UBool next()" (defined at listing line 149).
//
// NOTE(review): the signature line (listing line 149) and listing line 161
// are absent from this extract. The `break` and the otherwise unmatched
// closing brace near the end suggest that line 161 opened a loop over the
// text - confirm against the original file before editing.
//
// Visible behaviour: returns false once scriptEnd has reached charLimit.
// Otherwise it resets scriptCode to USCRIPT_COMMON, examines text starting at
// charArray[scriptEnd], updates scriptCode, scriptEnd, parenSP and parenStack
// while doing so, and returns true.
150 {
// startSP starts out as the stack top on entry
151  int32_t startSP = parenSP; // used to find the first new open character
// handed to getScript below; not inspected afterwards in the visible lines
152  UErrorCode error = U_ZERO_ERROR;
153 
154  // if we've fallen off the end of the text, we're done
155  if (scriptEnd >= charLimit) {
156  return false;
157  }
158 
// the script stays "common" until a character with a more specific script is seen
159  scriptCode = USCRIPT_COMMON;
160 
// NOTE(review): listing line 161 is missing here (see the note at the top)
// read the UTF-16 code unit at the current position
162  UChar high = charArray[scriptEnd];
163  UChar32 ch = high;
164 
165  // if the character is a high surrogate and it's not the last one
166  // in the text, see if it's followed by a low surrogate
167  if (rtl::isHighSurrogate(high) && scriptEnd < charLimit - 1)
168  {
169  UChar low = charArray[scriptEnd + 1];
170 
171  // if it is followed by a low surrogate,
172  // consume it and form the full character
173  if (rtl::isLowSurrogate(low)) {
174  ch = rtl::combineSurrogates(high, low);
175  scriptEnd += 1;
176  }
177  }
178 
// sc: script of this character (kana variants already folded by getScript);
// pairIndex: -1 unless ch is one of the paired punctuation characters
179  UScriptCode sc = getScript(ch, &error);
180  int32_t pairIndex = gPairIndices.getPairIndex(ch);
181 
182  // Paired character handling:
183 
184  // if it's an open character, push it onto the stack.
185  // if it's a close character, find the matching open on the
186  // stack, and use that script code. Any non-matching open
187  // characters above it on the stack will be popped.
188  if (pairIndex >= 0) {
// even pair index: opening character
189  if ((pairIndex & 1) == 0) {
190  ++parenSP;
// grow the stack by 128 entries once the new top index reaches its size
191  int32_t nVecSize = parenStack.size();
192  if (parenSP == nVecSize)
193  parenStack.resize(nVecSize + 128);
// store the run's script as known so far; it may be overwritten further down
// once a more specific script has been found
194  parenStack[parenSP].pairIndex = pairIndex;
195  parenStack[parenSP].scriptCode = scriptCode;
// odd pair index: closing character; only relevant when the stack is not empty
196  } else if (parenSP >= 0) {
// clearing the low bit gives the pair index of the matching opening character
197  int32_t pi = pairIndex & ~1;
198 
199  while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
200  parenSP -= 1;
201  }
202 
// never leave startSP above the (possibly lowered) stack top
203  if (parenSP < startSP) {
204  startSP = parenSP;
205  }
206 
// a matching opener was found: the closer takes over the script stored with it
207  if (parenSP >= 0) {
208  sc = parenStack[parenSP].scriptCode;
209  }
210  }
211  }
212 
213  if (sameScript(scriptCode, sc)) {
// first character with a script above USCRIPT_INHERITED decides the run's script
214  if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
215  scriptCode = sc;
216 
217  // now that we have a final script code, fix any open
218  // characters we pushed before we knew the script code.
219  while (startSP < parenSP) {
220  parenStack[++startSP].scriptCode = scriptCode;
221  }
222  }
223 
224  // if this character is a close paired character,
225  // pop it from the stack
226  if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
227  parenSP -= 1;
228  /* decrement startSP only if it is >= 0,
229  decrementing it unnecessarily will lead to memory corruption
230  while processing the above while block.
231  e.g. startSP = -4 , parenSP = -1
232  */
233  if (startSP >= 0) {
234  startSP -= 1;
235  }
236  }
237  } else {
238  // if the run broke on a surrogate pair,
239  // end it before the high surrogate
// i.e. undo the extra scriptEnd step taken above when the low surrogate was consumed
240  if (ch >= 0x10000) {
241  scriptEnd -= 1;
242  }
243 
244  break;
245  }
246  }
247 
248  return true;
249 }
250 
251 }
UBool next()
Definition: scrptrun.cxx:149
int32_t scriptEnd
Definition: scrptrun.h:103
sal_Int16 script
const UChar * charArray
Definition: scrptrun.h:100
int32_t scriptStart
Definition: scrptrun.h:102
std::vector< ParenStackEntry > parenStack
Definition: scrptrun.h:106
int32_t parenSP
Definition: scrptrun.h:107
int32_t charLimit
Definition: scrptrun.h:99
const PairIndices gPairIndices
Definition: scrptrun.cxx:137
UScriptCode scriptCode
Definition: scrptrun.h:104
static bool sameScript(int32_t scriptOne, int32_t scriptTwo)
Definition: scrptrun.cxx:144
static const char fgClassID
The address of this static class variable serves as this class's ID for ICU "poor man's RTTI"...
Definition: scrptrun.h:113