LibreOffice Module lingucomponent (master) 1
simpleguesser.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
30#include <string.h>
31
32#ifdef SYSTEM_LIBEXTTEXTCAT
33#include <libexttextcat/textcat.h>
34#include <libexttextcat/common.h>
35#include <libexttextcat/constants.h>
36#include <libexttextcat/fingerprint.h>
37#else
38#include <textcat.h>
39#include <common.h>
40#include <constants.h>
41#include <fingerprint.h>
42#endif
43
44#include <sal/types.h>
45
46#include<rtl/character.hxx>
47#include "simpleguesser.hxx"
48
49static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2){
50 size_t i;
51 int ret = 0;
52
53 size_t min = s1.length();
54 if (min > s2.length())
55 min = s2.length();
56
57 for(i = 0; i < min && s2[i] && s1[i] && !ret; i++){
58 ret = rtl::toAsciiUpperCase(static_cast<unsigned char>(s1[i]))
59 - rtl::toAsciiUpperCase(static_cast<unsigned char>(s2[i]));
60 if(s1[i] == '.' || s2[i] == '.') {ret = 0;} //. is a neutral character
61 }
62 return ret;
63 }
64
65namespace {
66
70typedef struct textcat_t{
71
72 void **fprint;
73 char *fprint_disable;
74 uint4 size;
75 uint4 maxsize;
76
77 char output[MAXOUTPUTSIZE];
78
79} textcat_t;
80// end of the 3 structs
81
82}
83
85{
86 h = nullptr;
87}
88
90 // Check for self-assignment!
91 if (this == &sg) // Same object?
92 return *this; // Yes, so skip assignment, and just return *this.
93
94 if(h){textcat_Done(h);}
95 h = sg.h;
96 return *this;
97}
98
100{
101 if(h){textcat_Done(h);}
102}
103
107std::vector<Guess> SimpleGuesser::GuessLanguage(const char* text)
108{
109 std::vector<Guess> guesses;
110
111 if (!h)
112 return guesses;
113
114 int len = strlen(text);
115
118
119 const char *guess_list = textcat_Classify(h, text, len);
120
121 if (strcmp(guess_list, TEXTCAT_RESULT_SHORT_STR) == 0)
122 return guesses;
123
124 int current_pointer = 0;
125
126 while(guess_list[current_pointer] != '\0')
127 {
128 while (guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0')
129 current_pointer++;
130 if(guess_list[current_pointer] != '\0')
131 {
132 Guess g(guess_list + current_pointer);
133
134 guesses.push_back(g);
135
136 current_pointer++;
137 }
138 }
139
140 return guesses;
141}
142
144{
145 std::vector<Guess> ret = GuessLanguage(text);
146 return ret.empty() ? Guess() : ret[0];
147}
154std::vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
155{
156 textcat_t *tables = static_cast<textcat_t*>(h);
157
158 std::vector<Guess> lang;
159 if(!h){return lang;}
160
161 for (size_t i=0; i<tables->size; ++i)
162 {
163 if (tables->fprint_disable[i] & mask)
164 {
165 std::string langStr = "[";
166 langStr += fp_Name(tables->fprint[i]);
167 Guess g(langStr.c_str());
168 lang.push_back(g);
169 }
170 }
171
172 return lang;
173}
174
176{
177 return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
178}
179
181{
182 return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
183}
184
186{
187 return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
188}
189
190void SimpleGuesser::XableLanguage(const std::string& lang, char mask)
191{
192 textcat_t *tables = static_cast<textcat_t*>(h);
193
194 if(!h){return;}
195
196 for (size_t i=0; i<tables->size; i++)
197 {
198 std::string language(fp_Name(tables->fprint[i]));
199 if (startsAsciiCaseInsensitive(language,lang) == 0)
200 tables->fprint_disable[i] = mask;
201 }
202}
203
204void SimpleGuesser::EnableLanguage(const std::string& lang)
205{
206 XableLanguage(lang, sal::static_int_cast< char >( 0xF0 ));
207}
208
209void SimpleGuesser::DisableLanguage(const std::string& lang)
210{
211 XableLanguage(lang, sal::static_int_cast< char >( 0x0F ));
212}
213
214void SimpleGuesser::SetDBPath(const char* path, const char* prefix)
215{
216 if (h)
217 textcat_Done(h);
218 h = special_textcat_Init(path, prefix);
219}
220
221/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: guess.hxx:31
std::vector< Guess > GetManagedLanguages(const char mask)
Is used to know which language is available, unavailable or both when mask = 0xF0,...
std::vector< Guess > GetAllManagedLanguages()
List all languages (possibly in guesses or not)
SimpleGuesser()
inits the object with conf file "./conf.txt"
void XableLanguage(const std::string &lang, char mask)
void DisableLanguage(const std::string &lang)
Mark a language disabled.
void SetDBPath(const char *thePathOfConfFile, const char *prefix)
Load a new DB of fingerprints.
~SimpleGuesser()
destroy the object
SimpleGuesser & operator=(const SimpleGuesser &sg)
Guess GuessPrimaryLanguage(const char *text)
Analyze a text and return the most probable language of the text.
std::vector< Guess > GetAvailableLanguages()
List all available languages (possibly to be in guesses)
std::vector< Guess > GuessLanguage(const char *text)
Analyze a text and return the most probable languages of the text.
std::vector< Guess > GetUnavailableLanguages()
List all unavailable languages (disabled for any reason)
void EnableLanguage(const std::string &lang)
Mark a language enabled.
#define TEXTCAT_RESULT_SHORT_STR
Definition: guess.cxx:39
#define GUESS_SEPARATOR_OPEN
Definition: guess.hxx:22
def text(shape, orig_st)
size
int i
SwNodeOffset min(const SwNodeOffset &a, const SwNodeOffset &b)
static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2)
TODO.
#define MAX_STRING_LENGTH_TO_ANALYSE