LibreOffice Module lingucomponent (master)  1
simpleguesser.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
30 #include <string.h>
31 #include <sstream>
32 #include <iostream>
33 
34 #ifdef SYSTEM_LIBEXTTEXTCAT
35 #include <libexttextcat/textcat.h>
36 #include <libexttextcat/common.h>
37 #include <libexttextcat/constants.h>
38 #include <libexttextcat/fingerprint.h>
39 #include <libexttextcat/utf8misc.h>
40 #else
41 #include <textcat.h>
42 #include <common.h>
43 #include <constants.h>
44 #include <fingerprint.h>
45 #include <utf8misc.h>
46 #endif
47 
48 #include <sal/types.h>
49 
50 #include<rtl/character.hxx>
51 #include "simpleguesser.hxx"
52 
53 using namespace std;
54 
55 static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2){
56  size_t i;
57  int ret = 0;
58 
59  size_t min = s1.length();
60  if (min > s2.length())
61  min = s2.length();
62 
63  for(i = 0; i < min && s2[i] && s1[i] && !ret; i++){
64  ret = rtl::toAsciiUpperCase(static_cast<unsigned char>(s1[i]))
65  - rtl::toAsciiUpperCase(static_cast<unsigned char>(s2[i]));
66  if(s1[i] == '.' || s2[i] == '.') {ret = 0;} //. is a neutral character
67  }
68  return ret;
69  }
70 
74 typedef struct textcat_t{
75 
76  void **fprint;
78  uint4 size;
79  uint4 maxsize;
80 
81  char output[MAXOUTPUTSIZE];
82 
83 } textcat_t;
84 // end of the 3 structs
85 
87 {
88  h = nullptr;
89 }
90 
92  // Check for self-assignment!
93  if (this == &sg) // Same object?
94  return *this; // Yes, so skip assignment, and just return *this.
95 
96  if(h){textcat_Done(h);}
97  h = sg.h;
98  return *this;
99 }
100 
102 {
103  if(h){textcat_Done(h);}
104 }
105 
109 vector<Guess> SimpleGuesser::GuessLanguage(const char* text)
110 {
111  vector<Guess> guesses;
112 
113  if (!h)
114  return guesses;
115 
116  int len = strlen(text);
117 
120 
121  const char *guess_list = textcat_Classify(h, text, len);
122 
123  if (strcmp(guess_list, TEXTCAT_RESULT_SHORT_STR) == 0)
124  return guesses;
125 
126  int current_pointer = 0;
127 
128  for(int i = 0; guess_list[current_pointer] != '\0'; i++)
129  {
130  while (guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0')
131  current_pointer++;
132  if(guess_list[current_pointer] != '\0')
133  {
134  Guess g(guess_list + current_pointer);
135 
136  guesses.push_back(g);
137 
138  current_pointer++;
139  }
140  }
141 
142  return guesses;
143 }
144 
146 {
147  vector<Guess> ret = GuessLanguage(text);
148  return ret.empty() ? Guess() : ret[0];
149 }
156 vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
157 {
158  textcat_t *tables = static_cast<textcat_t*>(h);
159 
160  vector<Guess> lang;
161  if(!h){return lang;}
162 
163  for (size_t i=0; i<tables->size; ++i)
164  {
165  if (tables->fprint_disable[i] & mask)
166  {
167  string langStr = "[";
168  langStr += fp_Name(tables->fprint[i]);
169  Guess g(langStr.c_str());
170  lang.push_back(g);
171  }
172  }
173 
174  return lang;
175 }
176 
178 {
179  return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
180 }
181 
183 {
184  return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
185 }
186 
188 {
189  return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
190 }
191 
192 void SimpleGuesser::XableLanguage(const string& lang, char mask)
193 {
194  textcat_t *tables = static_cast<textcat_t*>(h);
195 
196  if(!h){return;}
197 
198  for (size_t i=0; i<tables->size; i++)
199  {
200  string language(fp_Name(tables->fprint[i]));
201  if (startsAsciiCaseInsensitive(language,lang) == 0)
202  tables->fprint_disable[i] = mask;
203  }
204 }
205 
206 void SimpleGuesser::EnableLanguage(const string& lang)
207 {
208  XableLanguage(lang, sal::static_int_cast< char >( 0xF0 ));
209 }
210 
211 void SimpleGuesser::DisableLanguage(const string& lang)
212 {
213  XableLanguage(lang, sal::static_int_cast< char >( 0x0F ));
214 }
215 
216 void SimpleGuesser::SetDBPath(const char* path, const char* prefix)
217 {
218  if (h)
219  textcat_Done(h);
220  h = special_textcat_Init(path, prefix);
221 }
222 
223 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
void XableLanguage(const string &lang, char mask)
#define GUESS_SEPARATOR_OPEN
Definition: guess.hxx:22
void ** fprint
void DisableLanguage(const string &lang)
Mark a language disabled.
#define MAX_STRING_LENGTH_TO_ANALYSE
SimpleGuesser()
inits the object with conf file "./conf.txt"
vector< Guess > GetAvailableLanguages()
List all available languages (possibly to be in guesses)
#define min(a, b)
#define TEXTCAT_RESULT_SHORT_STR
Definition: guess.cxx:40
~SimpleGuesser()
destroy the object
int i
vector< Guess > GuessLanguage(const char *text)
Analyze a text and return the most probable languages of the text.
char * fprint_disable
vector< Guess > GetAllManagedLanguages()
List all languages (possibly in guesses or not)
static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2)
struct textcat_t textcat_t
This following structure is from textcat.c.
void EnableLanguage(const string &lang)
Mark a language enabled.
vector< Guess > GetUnavailableLanguages()
List all Unavailable languages (disable for any reason)
vector< Guess > GetManagedLanguages(const char mask)
Is used to know which languages are available, unavailable or both: when mask = 0xF0, return only Available; when mask = 0x0F, return only Unavailable; when mask = 0xFF, return both Available and Unavailable.
Definition: guess.hxx:33
SimpleGuesser & operator=(const SimpleGuesser &sg)
This following structure is from textcat.c.
void SetDBPath(const char *thePathOfConfFile, const char *prefix)
Load a new DB of fingerprints.
sal_Int32 h
Guess GuessPrimaryLanguage(const char *text)
Analyze a text and return the most probable language of the text.