LibreOffice Module i18npool (master)  1
gendict.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #include <errno.h>
24 #include <sal/main.h>
25 #include <sal/types.h>
26 #include <rtl/ustring.hxx>
27 #include <osl/diagnose.h>
28 #include <vector>
29 
30 using std::vector;
31 
32 
33 // For iOS, where we must strive for a minimal executable size, we
34 // keep the data produced by this utility not as large const tables in
35 // source code but instead as separate data files, to be bundled with
36 // an app, and mmapped in at run time.
37 
38 // To test this easier on a desktop OS, just make sure
39 // DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
40 
#ifdef DICT_JA_ZH_IN_DATAFILE
// Positions of the individual tables inside the generated data file.
// Each value is captured with ftell() by the function that writes the
// corresponding table, and all five are appended to the very end of the
// output file by the main routine.
static sal_Int64 dataAreaOffset = 0;   // set in printDataArea()
static sal_Int64 lenArrayOffset = 0;   // set in printLenArray()
static sal_Int64 index1Offset = 0;     // set in printIndex1()
static sal_Int64 index2Offset = 0;     // set in printIndex2()
static sal_Int64 existMarkOffset = 0;  // set in printExistsMask()
#endif
48 
49 /* Utility gendict:
50 
51  "BreakIterator_CJK provides input string caching and dictionary searching for
52  longest matching. You can provide a sorted dictionary (the encoding must be
53  UTF-8) by creating the following file:
54  i18npool/source/breakiterator/data/<language>.dict.
55 
56  The utility gendict will convert the file to C code, which will be compiled
57  into a shared library for dynamic loading.
58 
59  All dictionary searching and loading is performed in the xdictionary class.
60  The only thing you need to do is to derive your class from BreakIterator_CJK
61  and create an instance of the xdictionary with the language name and
62  pass it to the parent class." (from https://wiki.documentfoundation.org/
63  Documentation/DevGuide/Office_Development#Implementing_a_New_Locale - 27/01/2011)
64 */
65 
66 // C-standard guarantees that static variables are automatically initialized to 0
67 static sal_uInt8 exists[0x2000];
68 static sal_uInt32 charArray[0x10000];
69 
70 static void set_exists(sal_uInt32 index)
71 {
72  exists[index>>3] |= 1 << (index & 0x07);
73 }
74 
/* Writes the "generated file" banner and the #include line that open the
   generated C source. When the tables go into a data file
   (DICT_JA_ZH_IN_DATAFILE) there is no source preamble and nothing is
   written. */
static void printIncludes(FILE* source_fp)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
#else
    static const char *const preamble[] = {
        "/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n",
        "#include <sal/types.h>\n\n"
    };

    for (size_t n = 0; n < sizeof(preamble) / sizeof(preamble[0]); ++n)
        fputs(preamble[n], source_fp);
#endif
}
84 
/* Writes the exported accessor functions that hand out the generated
   tables. Two variants are emitted, selected in the generated code by
   DISABLE_DYNLOADING: plain names (getIndex1, ...) for the dynamically
   loaded case and names suffixed with "_<lang>" otherwise.

   source_fp  stream receiving the generated C source
   lang       language code used as the function-name suffix

   In DICT_JA_ZH_IN_DATAFILE mode nothing is written. */
static void printFunctions(FILE* source_fp, const char *lang)
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    (void) source_fp;
    (void) lang;
#else
    // Accessors without a language suffix need no formatting at all.
    static const char *const plainAccessors[] = {
        "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n",
        "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n"
    };

    fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
    for (size_t n = 0; n < sizeof(plainAccessors) / sizeof(plainAccessors[0]); ++n)
        fputs (plainAccessors[n], source_fp);

    fputs ("#else\n", source_fp);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
    fputs ("#endif\n", source_fp);
#endif
}
106 
107 static void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
108 {
109  // generate main dict. data array
110 #ifndef DICT_JA_ZH_IN_DATAFILE
111  fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
112 #else
113  dataAreaOffset = ftell(source_fp);
114 #endif
115  char str[1024];
116  sal_uInt32 lenArrayCurr = 0;
117  sal_Unicode current = 0;
118 
119  while (fgets(str, 1024, dictionary_fp)) {
120  // input file is in UTF-8 encoding
121  // don't convert last new line character to Ostr.
122  OUString Ostr(str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
123 
124  const sal_Int32 len = Ostr.getLength();
125 
126  sal_Int32 i=0;
127  Ostr.iterateCodePoints(&i);
128  if (len == i)
129  continue; // skip one character word
130 
131  if (Ostr[0] != current) {
132  OSL_ENSURE( (Ostr[0] > current), "Dictionary file should be sorted");
133  current = Ostr[0];
134  charArray[current] = lenArray.size();
135  }
136 
137  lenArray.push_back(lenArrayCurr);
138 
139  set_exists(Ostr[0]);
140  // first character is stored in charArray, so start from second
141  for (i = 1; i < len; i++, lenArrayCurr++) {
142  set_exists(Ostr[i]);
143 #ifndef DICT_JA_ZH_IN_DATAFILE
144  fprintf(source_fp, "0x%04x, ", Ostr[i]);
145  if ((lenArrayCurr & 0x0f) == 0x0f)
146  fputs("\n\t", source_fp);
147 #else
148  sal_Unicode x = Ostr[i];
149  fwrite(&x, sizeof(Ostr[i]), 1, source_fp);
150 #endif
151  }
152  }
153  charArray[current+1] = lenArray.size();
154  lenArray.push_back( lenArrayCurr ); // store last ending pointer
155 #ifndef DICT_JA_ZH_IN_DATAFILE
156  fputs("\n};\n", source_fp);
157 #endif
158 }
159 
160 static void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
161 {
162 #ifndef DICT_JA_ZH_IN_DATAFILE
163  fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
164  fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
165 #else
166  lenArrayOffset = ftell(source_fp);
167  sal_uInt32 zero(0);
168  fwrite(&zero, sizeof(zero), 1, source_fp);
169 #endif
170  for (size_t k = 0; k < lenArray.size(); k++)
171  {
172  if( !(k & 0xf) )
173  fputs("\n\t", source_fp);
174 
175 #ifndef DICT_JA_ZH_IN_DATAFILE
176  fprintf(source_fp, "0x%" SAL_PRIxUINT32 ", ", lenArray[k]);
177 #else
178  fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
179 #endif
180  }
181 
182 #ifndef DICT_JA_ZH_IN_DATAFILE
183  fputs("\n};\n", source_fp );
184 #endif
185 }
186 
187 /* FIXME?: what happens if in every range i there is at least one charArray != 0
188  => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
189  => then in index2, the last range will be ignored incorrectly */
190 static void printIndex1(FILE *source_fp, sal_Int16 *set)
191 {
192 #ifndef DICT_JA_ZH_IN_DATAFILE
193  fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
194 #else
195  index1Offset = ftell(source_fp);
196 #endif
197 
198  sal_Int16 count = 0;
199  for (sal_Int32 i = 0; i < 0x100; i++) {
200  sal_Int32 j = 0;
201  while( j < 0x100 && charArray[(i<<8) + j] == 0)
202  j++;
203 
204  set[i] = (j < 0x100 ? count++ : 0xff);
205 #ifndef DICT_JA_ZH_IN_DATAFILE
206  fprintf(source_fp, "0x%02x, ", set[i]);
207  if ((i & 0x0f) == 0x0f)
208  fputs ("\n\t", source_fp);
209 #else
210  fwrite(&set[i], sizeof(set[i]), 1, source_fp);
211 #endif
212  }
213 
214 #ifndef DICT_JA_ZH_IN_DATAFILE
215  fputs("};\n", source_fp);
216 #endif
217 }
218 
219 static void printIndex2(FILE *source_fp, sal_Int16 const *set)
220 {
221 #ifndef DICT_JA_ZH_IN_DATAFILE
222  fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
223 #else
224  index2Offset = ftell(source_fp);
225 #endif
226  sal_Int32 prev = 0;
227  for (sal_Int32 i = 0; i < 0x100; i++) {
228  if (set[i] != 0xff) {
229  for (sal_Int32 j = 0; j < 0x100; j++) {
230  sal_Int32 k = (i<<8) + j;
231  if (prev != 0 )
232  while( k < 0x10000 && charArray[k] == 0 )
233  k++;
234 
235  prev = charArray[(i<<8) + j];
236 #ifndef DICT_JA_ZH_IN_DATAFILE
237  fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
238  if ((j & 0x0f) == 0x0f)
239  fputs ("\n\t", source_fp);
240 #else
241  sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
242  fwrite(&n, sizeof(n), 1, source_fp);
243 #endif
244  }
245 #ifndef DICT_JA_ZH_IN_DATAFILE
246  fputs ("\n\t", source_fp);
247 #endif
248  }
249  }
250 #ifndef DICT_JA_ZH_IN_DATAFILE
251  fputs ("\n};\n", source_fp);
252 #endif
253 }
254 
255 /* Generates a bitmask for the existence of sal_Unicode values in dictionary;
256  it packs 8 sal_Bool values in 1 sal_uInt8 */
257 static void printExistsMask(FILE *source_fp)
258 {
259 #ifndef DICT_JA_ZH_IN_DATAFILE
260  fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
261 #else
262  existMarkOffset = ftell(source_fp);
263 #endif
264  for (unsigned int i = 0; i < 0x2000; i++)
265  {
266 #ifndef DICT_JA_ZH_IN_DATAFILE
267  fprintf(source_fp, "0x%02x, ", exists[i]);
268  if ( (i & 0xf) == 0xf )
269  fputs("\n\t", source_fp);
270 #else
271  fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
272 #endif
273  }
274 
275 #ifndef DICT_JA_ZH_IN_DATAFILE
276  fputs("\n};\n", source_fp);
277 #endif
278 }
279 
281 {
282  FILE *dictionary_fp, *source_fp;
283 
284  if (argc == 1 || argc > 4)
285  {
286  fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
287  exit(-1);
288  }
289 
290  dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
291  if (dictionary_fp == nullptr)
292  {
293  fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
294  exit(1);
295  }
296 
297  if(argc == 2)
298  source_fp = stdout;
299  else
300  {
301  // create the C source file to write
302  source_fp = fopen(argv[2], "wb");
303  if (source_fp == nullptr) {
304  fclose(dictionary_fp);
305  fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
306  exit(1);
307  }
308  }
309 
310  vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
311  sal_Int16 set[0x100];
312 
313  printIncludes(source_fp);
314 #ifndef DICT_JA_ZH_IN_DATAFILE
315  fputs("extern \"C\" {\n", source_fp);
316 #endif
317  printDataArea(dictionary_fp, source_fp, lenArray);
318  printLenArray(source_fp, lenArray);
319  printIndex1(source_fp, set);
320  printIndex2(source_fp, set);
321  printExistsMask(source_fp);
322  printFunctions(source_fp, argv[3]);
323 #ifndef DICT_JA_ZH_IN_DATAFILE
324  fputs("}\n", source_fp);
325 #else
326  // Put pointers to the tables at the end of the file...
327  fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
328  fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
329  fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
330  fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
331  fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
332 #endif
333 
334  fclose(dictionary_fp);
335  fclose(source_fp);
336 
337  return 0;
338 }
339 
340 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
static void printFunctions(FILE *source_fp, const char *lang)
Definition: gendict.cxx:85
static void printIndex2(FILE *source_fp, sal_Int16 const *set)
Definition: gendict.cxx:219
sal_Int64 n
float x
sal_uInt16 sal_Unicode
static void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector< sal_uInt32 > &lenArray)
Definition: gendict.cxx:107
static sal_uInt32 charArray[0x10000]
Definition: gendict.cxx:68
static void printLenArray(FILE *source_fp, const vector< sal_uInt32 > &lenArray)
Definition: gendict.cxx:160
int i
static void set_exists(sal_uInt32 index)
Definition: gendict.cxx:70
static void printIndex1(FILE *source_fp, sal_Int16 *set)
Definition: gendict.cxx:190
static void printIncludes(FILE *source_fp)
Definition: gendict.cxx:75
unsigned char sal_uInt8
static void printExistsMask(FILE *source_fp)
Definition: gendict.cxx:257
static sal_uInt8 exists[0x2000]
Definition: gendict.cxx:67
void set(css::uno::UnoInterfaceReference const &value)
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
Definition: gendict.cxx:280
typedef void(CALLTYPE *GetFuncDataPtr)(sal_uInt16 &nNo