LibreOffice Module i18npool (master) 1
gendict.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <stdio.h>
21#include <string.h>
22#include <stdlib.h>
23#include <errno.h>
24#include <sal/main.h>
25#include <sal/types.h>
26#include <rtl/ustring.hxx>
27#include <osl/diagnose.h>
28#include <vector>
29
30using std::vector;
31
32
33// For iOS, where we must strive for a minimal executable size, we
34// keep the data produced by this utility not as large const tables in
35// source code but instead as separate data files, to be bundled with
36// an app, and mmapped in at run time.
37
38// To test this easier on a desktop OS, just make sure
39// DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
40
// Positions of the individual tables inside the generated data file.
// Each one is assigned from ftell() by the corresponding print* function
// just before that table is written, and all five are appended to the end
// of the output by the main function. They exist only when
// DICT_JA_ZH_IN_DATAFILE is defined.
41#ifdef DICT_JA_ZH_IN_DATAFILE
42static sal_Int64 dataAreaOffset = 0;
43static sal_Int64 lenArrayOffset = 0;
44static sal_Int64 index1Offset = 0;
45static sal_Int64 index2Offset = 0;
46static sal_Int64 existMarkOffset = 0;
47#endif
48
49/* Utility gendict:
50
51 "BreakIterator_CJK provides input string caching and dictionary searching for
52 longest matching. You can provide a sorted dictionary (the encoding must be
53 UTF-8) by creating the following file:
54 i18npool/source/breakiterator/data/<language>.dict.
55
56 The utility gendict will convert the file to C code, which will be compiled
57 into a shared library for dynamic loading.
58
59 All dictionary searching and loading is performed in the xdictionary class.
60 The only thing you need to do is to derive your class from BreakIterator_CJK
61 and create an instance of the xdictionary with the language name and
62 pass it to the parent class." (from https://wiki.documentfoundation.org/
63 Documentation/DevGuide/Office_Development#Implementing_a_New_Locale - 27/01/2011)
64*/
65
66// C-standard guarantees that static variables are automatically initialized to 0
// Bitmap with one bit per 16-bit code unit value (0x2000 bytes * 8 bits);
// bits are set by set_exists() and dumped by printExistsMask().
67static sal_uInt8 exists[0x2000];
// Indexed by the first code unit of a word: printDataArea() stores here the
// size lenArray had when the first word starting with that code unit was
// read. Entries never written stay 0.
68static sal_uInt32 charArray[0x10000];
69
70static void set_exists(sal_uInt32 index)
71{
72 exists[index>>3] |= 1 << (index & 0x07);
73}
74
// Writes the preamble of the generated source file: a "do not edit" banner
// followed by the <sal/types.h> include. When the tables go into a binary
// data file (DICT_JA_ZH_IN_DATAFILE) there is no preamble and nothing is
// written.
static void printIncludes(FILE* source_fp)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
#else
    fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n"
          "#include <sal/types.h>\n\n",
          source_fp);
#endif
}
84
// Writes the accessor functions that expose the generated tables.
//
// Two sets are emitted, selected in the generated file by DISABLE_DYNLOADING:
// plain names (getExistMark, getIndex1, ...) for the dynamically loaded
// library, and names suffixed with "_<lang>" otherwise.
//
// source_fp  output stream for the generated source
// lang       language code used as the suffix of the second set
//
// Nothing is written when DICT_JA_ZH_IN_DATAFILE is defined.
static void printFunctions(FILE* source_fp, const char *lang)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
    (void) lang;
#else
    // Accessors without a language suffix.
    fputs("#ifndef DISABLE_DYNLOADING\n"
          "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n"
          "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n"
          "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n"
          "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n"
          "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n"
          "#else\n",
          source_fp);

    // Accessors carrying the language code; one %s per generated function.
    fprintf(source_fp,
            "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n"
            "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n"
            "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n"
            "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n"
            "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n",
            lang, lang, lang, lang, lang);

    fputs("#endif\n", source_fp);
#endif
}
106
107static void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
108{
109 // generate main dict. data array
110#ifndef DICT_JA_ZH_IN_DATAFILE
111 fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
112#else
113 dataAreaOffset = ftell(source_fp);
114#endif
115 char str[1024];
116 sal_uInt32 lenArrayCurr = 0;
117 sal_Unicode current = 0;
118
119 while (fgets(str, 1024, dictionary_fp)) {
120 // input file is in UTF-8 encoding
121 // don't convert last new line character to Ostr.
122 OUString Ostr(str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
123
124 const sal_Int32 len = Ostr.getLength();
125
126 sal_Int32 i=0;
127 Ostr.iterateCodePoints(&i);
128 if (len == i)
129 continue; // skip one character word
130
131 if (Ostr[0] != current) {
132 OSL_ENSURE( (Ostr[0] > current), "Dictionary file should be sorted");
133 current = Ostr[0];
134 charArray[current] = lenArray.size();
135 }
136
137 lenArray.push_back(lenArrayCurr);
138
139 set_exists(Ostr[0]);
140 // first character is stored in charArray, so start from second
141 for (i = 1; i < len; i++, lenArrayCurr++) {
142 set_exists(Ostr[i]);
143#ifndef DICT_JA_ZH_IN_DATAFILE
144 fprintf(source_fp, "0x%04x, ", Ostr[i]);
145 if ((lenArrayCurr & 0x0f) == 0x0f)
146 fputs("\n\t", source_fp);
147#else
148 sal_Unicode x = Ostr[i];
149 fwrite(&x, sizeof(Ostr[i]), 1, source_fp);
150#endif
151 }
152 }
153 charArray[current+1] = lenArray.size();
154 lenArray.push_back( lenArrayCurr ); // store last ending pointer
155#ifndef DICT_JA_ZH_IN_DATAFILE
156 fputs("\n};\n", source_fp);
157#endif
158}
159
160static void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
161{
162#ifndef DICT_JA_ZH_IN_DATAFILE
163 fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
164 fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
165#else
166 lenArrayOffset = ftell(source_fp);
167 sal_uInt32 zero(0);
168 fwrite(&zero, sizeof(zero), 1, source_fp);
169#endif
170 for (size_t k = 0; k < lenArray.size(); k++)
171 {
172 if( !(k & 0xf) )
173 fputs("\n\t", source_fp);
174
175#ifndef DICT_JA_ZH_IN_DATAFILE
176 fprintf(source_fp, "0x%" SAL_PRIxUINT32 ", ", lenArray[k]);
177#else
178 fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
179#endif
180 }
181
182#ifndef DICT_JA_ZH_IN_DATAFILE
183 fputs("\n};\n", source_fp );
184#endif
185}
186
187/* FIXME?: what happens if in every range i there is at least one charArray != 0
188 => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
189 => then in index2, the last range will be ignored incorrectly */
190static void printIndex1(FILE *source_fp, sal_Int16 *set)
191{
192#ifndef DICT_JA_ZH_IN_DATAFILE
193 fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
194#else
195 index1Offset = ftell(source_fp);
196#endif
197
198 sal_Int16 count = 0;
199 for (sal_Int32 i = 0; i < 0x100; i++) {
200 sal_Int32 j = 0;
201 while( j < 0x100 && charArray[(i<<8) + j] == 0)
202 j++;
203
204 set[i] = (j < 0x100 ? count++ : 0xff);
205#ifndef DICT_JA_ZH_IN_DATAFILE
206 fprintf(source_fp, "0x%02x, ", set[i]);
207 if ((i & 0x0f) == 0x0f)
208 fputs ("\n\t", source_fp);
209#else
210 fwrite(&set[i], sizeof(set[i]), 1, source_fp);
211#endif
212 }
213
214#ifndef DICT_JA_ZH_IN_DATAFILE
215 fputs("};\n", source_fp);
216#endif
217}
218
219static void printIndex2(FILE *source_fp, sal_Int16 const *set)
220{
221#ifndef DICT_JA_ZH_IN_DATAFILE
222 fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
223#else
224 index2Offset = ftell(source_fp);
225#endif
226 sal_Int32 prev = 0;
227 for (sal_Int32 i = 0; i < 0x100; i++) {
228 if (set[i] != 0xff) {
229 for (sal_Int32 j = 0; j < 0x100; j++) {
230 sal_Int32 k = (i<<8) + j;
231 if (prev != 0 )
232 while( k < 0x10000 && charArray[k] == 0 )
233 k++;
234
235 prev = charArray[(i<<8) + j];
236#ifndef DICT_JA_ZH_IN_DATAFILE
237 fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
238 if ((j & 0x0f) == 0x0f)
239 fputs ("\n\t", source_fp);
240#else
241 sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
242 fwrite(&n, sizeof(n), 1, source_fp);
243#endif
244 }
245#ifndef DICT_JA_ZH_IN_DATAFILE
246 fputs ("\n\t", source_fp);
247#endif
248 }
249 }
250#ifndef DICT_JA_ZH_IN_DATAFILE
251 fputs ("\n};\n", source_fp);
252#endif
253}
254
255/* Generates a bitmask for the existence of sal_Unicode values in dictionary;
256 it packs 8 sal_Bool values in 1 sal_uInt8 */
257static void printExistsMask(FILE *source_fp)
258{
259#ifndef DICT_JA_ZH_IN_DATAFILE
260 fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
261#else
262 existMarkOffset = ftell(source_fp);
263#endif
264 for (unsigned int i = 0; i < 0x2000; i++)
265 {
266#ifndef DICT_JA_ZH_IN_DATAFILE
267 fprintf(source_fp, "0x%02x, ", exists[i]);
268 if ( (i & 0xf) == 0xf )
269 fputs("\n\t", source_fp);
270#else
271 fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
272#endif
273 }
274
275#ifndef DICT_JA_ZH_IN_DATAFILE
276 fputs("\n};\n", source_fp);
277#endif
278}
279
281{
282 FILE *dictionary_fp, *source_fp;
283
284 if (argc == 1 || argc > 4)
285 {
286 fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
287 exit(-1);
288 }
289
290 dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
291 if (dictionary_fp == nullptr)
292 {
293 fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
294 exit(1);
295 }
296
297 if(argc == 2)
298 source_fp = stdout;
299 else
300 {
301 // create the C source file to write
302 source_fp = fopen(argv[2], "wb");
303 if (source_fp == nullptr) {
304 fclose(dictionary_fp);
305 fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
306 exit(1);
307 }
308 }
309
310 vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
311 sal_Int16 set[0x100];
312
313 printIncludes(source_fp);
314#ifndef DICT_JA_ZH_IN_DATAFILE
315 fputs("extern \"C\" {\n", source_fp);
316#endif
317 printDataArea(dictionary_fp, source_fp, lenArray);
318 printLenArray(source_fp, lenArray);
319 printIndex1(source_fp, set);
320 printIndex2(source_fp, set);
321 printExistsMask(source_fp);
322 printFunctions(source_fp, argv[3]);
323#ifndef DICT_JA_ZH_IN_DATAFILE
324 fputs("}\n", source_fp);
325#else
326 // Put pointers to the tables at the end of the file...
327 fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
328 fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
329 fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
330 fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
331 fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
332#endif
333
334 fclose(dictionary_fp);
335 fclose(source_fp);
336
337 return 0;
338}
339
340/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
float x
static void printLenArray(FILE *source_fp, const vector< sal_uInt32 > &lenArray)
Definition: gendict.cxx:160
static void printIndex1(FILE *source_fp, sal_Int16 *set)
Definition: gendict.cxx:190
static sal_uInt8 exists[0x2000]
Definition: gendict.cxx:67
static void set_exists(sal_uInt32 index)
Definition: gendict.cxx:70
static void printExistsMask(FILE *source_fp)
Definition: gendict.cxx:257
static void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector< sal_uInt32 > &lenArray)
Definition: gendict.cxx:107
static void printIncludes(FILE *source_fp)
Definition: gendict.cxx:75
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
Definition: gendict.cxx:280
static void printFunctions(FILE *source_fp, const char *lang)
Definition: gendict.cxx:85
static sal_uInt32 charArray[0x10000]
Definition: gendict.cxx:68
static void printIndex2(FILE *source_fp, sal_Int16 const *set)
Definition: gendict.cxx:219
sal_Int64 n
void set(css::uno::UnoInterfaceReference const &value)
int i
index
unsigned char sal_uInt8
sal_uInt16 sal_Unicode