LibreOffice Module lingucomponent (master)  1
simpleguesser.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
30 #include <string.h>
31 #include <sstream>
32 #include <iostream>
33 
34 #ifdef SYSTEM_LIBEXTTEXTCAT
35 #include <libexttextcat/textcat.h>
36 #include <libexttextcat/common.h>
37 #include <libexttextcat/constants.h>
38 #include <libexttextcat/fingerprint.h>
39 #include <libexttextcat/utf8misc.h>
40 #else
41 #include <textcat.h>
42 #include <common.h>
43 #include <constants.h>
44 #include <fingerprint.h>
45 #include <utf8misc.h>
46 #endif
47 
48 #include <sal/types.h>
49 
50 #include<rtl/character.hxx>
51 #include "simpleguesser.hxx"
52 
53 using namespace std;
54 
55 static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2){
56  size_t i;
57  int ret = 0;
58 
59  size_t min = s1.length();
60  if (min > s2.length())
61  min = s2.length();
62 
63  for(i = 0; i < min && s2[i] && s1[i] && !ret; i++){
64  ret = rtl::toAsciiUpperCase(static_cast<unsigned char>(s1[i]))
65  - rtl::toAsciiUpperCase(static_cast<unsigned char>(s2[i]));
66  if(s1[i] == '.' || s2[i] == '.') {ret = 0;} //. is a neutral character
67  }
68  return ret;
69  }
70 
71 namespace {
72 
76 typedef struct textcat_t{
77 
78  void **fprint;
79  char *fprint_disable;
80  uint4 size;
81  uint4 maxsize;
82 
83  char output[MAXOUTPUTSIZE];
84 
85 } textcat_t;
86 // end of the 3 structs
87 
88 }
89 
91 {
92  h = nullptr;
93 }
94 
96  // Check for self-assignment!
97  if (this == &sg) // Same object?
98  return *this; // Yes, so skip assignment, and just return *this.
99 
100  if(h){textcat_Done(h);}
101  h = sg.h;
102  return *this;
103 }
104 
106 {
107  if(h){textcat_Done(h);}
108 }
109 
113 vector<Guess> SimpleGuesser::GuessLanguage(const char* text)
114 {
115  vector<Guess> guesses;
116 
117  if (!h)
118  return guesses;
119 
120  int len = strlen(text);
121 
124 
125  const char *guess_list = textcat_Classify(h, text, len);
126 
127  if (strcmp(guess_list, TEXTCAT_RESULT_SHORT_STR) == 0)
128  return guesses;
129 
130  int current_pointer = 0;
131 
132  for(int i = 0; guess_list[current_pointer] != '\0'; i++)
133  {
134  while (guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0')
135  current_pointer++;
136  if(guess_list[current_pointer] != '\0')
137  {
138  Guess g(guess_list + current_pointer);
139 
140  guesses.push_back(g);
141 
142  current_pointer++;
143  }
144  }
145 
146  return guesses;
147 }
148 
150 {
151  vector<Guess> ret = GuessLanguage(text);
152  return ret.empty() ? Guess() : ret[0];
153 }
160 vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
161 {
162  textcat_t *tables = static_cast<textcat_t*>(h);
163 
164  vector<Guess> lang;
165  if(!h){return lang;}
166 
167  for (size_t i=0; i<tables->size; ++i)
168  {
169  if (tables->fprint_disable[i] & mask)
170  {
171  string langStr = "[";
172  langStr += fp_Name(tables->fprint[i]);
173  Guess g(langStr.c_str());
174  lang.push_back(g);
175  }
176  }
177 
178  return lang;
179 }
180 
182 {
183  return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
184 }
185 
187 {
188  return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
189 }
190 
192 {
193  return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
194 }
195 
196 void SimpleGuesser::XableLanguage(const string& lang, char mask)
197 {
198  textcat_t *tables = static_cast<textcat_t*>(h);
199 
200  if(!h){return;}
201 
202  for (size_t i=0; i<tables->size; i++)
203  {
204  string language(fp_Name(tables->fprint[i]));
205  if (startsAsciiCaseInsensitive(language,lang) == 0)
206  tables->fprint_disable[i] = mask;
207  }
208 }
209 
210 void SimpleGuesser::EnableLanguage(const string& lang)
211 {
212  XableLanguage(lang, sal::static_int_cast< char >( 0xF0 ));
213 }
214 
215 void SimpleGuesser::DisableLanguage(const string& lang)
216 {
217  XableLanguage(lang, sal::static_int_cast< char >( 0x0F ));
218 }
219 
220 void SimpleGuesser::SetDBPath(const char* path, const char* prefix)
221 {
222  if (h)
223  textcat_Done(h);
224  h = special_textcat_Init(path, prefix);
225 }
226 
227 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
void XableLanguage(const string &lang, char mask)
#define GUESS_SEPARATOR_OPEN
Definition: guess.hxx:22
void DisableLanguage(const string &lang)
Mark a language disabled.
#define MAX_STRING_LENGTH_TO_ANALYSE
SimpleGuesser()
inits the object with conf file "./conf.txt"
vector< Guess > GetAvailableLanguages()
List all available languages (possibly to be in guesses)
#define min(a, b)
#define TEXTCAT_RESULT_SHORT_STR
Definition: guess.cxx:40
~SimpleGuesser()
destroy the object
int i
size
vector< Guess > GuessLanguage(const char *text)
Analyze a text and return the most probable languages of the text.
vector< Guess > GetAllManagedLanguages()
List all languages (possibly in guesses or not)
static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2)
void EnableLanguage(const string &lang)
Mark a language enabled.
vector< Guess > GetUnavailableLanguages()
List all unavailable languages (disabled for any reason)
vector< Guess > GetManagedLanguages(const char mask)
Used to find out which languages are available, unavailable, or both: when mask = 0xF0, return only the available languages; when mask = 0x0F, return only the unavailable languages; when mask = 0xFF, return both available and unavailable languages.
Definition: guess.hxx:33
SimpleGuesser & operator=(const SimpleGuesser &sg)
void SetDBPath(const char *thePathOfConfFile, const char *prefix)
Load a new DB of fingerprints.
sal_Int32 h
Guess GuessPrimaryLanguage(const char *text)
Analyze a text and return the most probable language of the text.