summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@dside.dyndns.org>2005-08-02 04:43:01 +0000
committerSuren A. Chilingaryan <csa@dside.dyndns.org>2005-08-02 04:43:01 +0000
commit8b75f9bb6a09d54d634ff661655659951378aa2c (patch)
tree611f800f33ca919d37c724957fcc8c2b2bccb342
parente3f702e83a26468ee44f3f342a7a40a252f4603c (diff)
downloadlibrcc-8b75f9bb6a09d54d634ff661655659951378aa2c.tar.gz
librcc-8b75f9bb6a09d54d634ff661655659951378aa2c.tar.bz2
librcc-8b75f9bb6a09d54d634ff661655659951378aa2c.tar.xz
librcc-8b75f9bb6a09d54d634ff661655659951378aa2c.zip
Language autodetection
- rccConfigRecode function's are added - Language autodetection using aspell is added - Translation in 3 modes: "To English Language", "Skip English Translation", "Full" - Example2 to demonstrate translation possibilities
-rw-r--r--ToDo7
-rw-r--r--configure.in8
-rw-r--r--examples/Makefile.am3
-rw-r--r--examples/example2.c13
-rw-r--r--examples/input-russian.txt2
-rw-r--r--examples/rcc-gtk-config.c6
-rw-r--r--m4/aspell.m444
-rw-r--r--src/Makefile.am5
-rw-r--r--src/librcc.h119
-rw-r--r--src/lng.c45
-rw-r--r--src/lng.h2
-rw-r--r--src/lngconfig.c229
-rw-r--r--src/lngconfig.h7
-rw-r--r--src/rccconfig.c27
-rw-r--r--src/rccconfig.h5
-rw-r--r--src/rccexternal.c2
-rw-r--r--src/rcciconv.c5
-rw-r--r--src/rcciconv.h2
-rw-r--r--src/rccspell.c63
-rw-r--r--src/rccspell.h29
-rw-r--r--src/rccstring.c8
-rw-r--r--src/rccstring.h1
-rw-r--r--src/rcctranslate.c10
-rw-r--r--src/recode.c233
-rw-r--r--ui/rccnames.c4
25 files changed, 795 insertions, 84 deletions
diff --git a/ToDo b/ToDo
index 8167c92..fdb843f 100644
--- a/ToDo
+++ b/ToDo
@@ -1,4 +1,4 @@
-0.2.x:
+0.3.x:
- Common encodings:
+ Provide way to add to all languages several default Unicode encodings (UTF8, UTF16, UTF16BE)
+ Special type of classes to select only from Unicode encodings (or even just specified subset of encodings)
@@ -8,9 +8,8 @@
* Code some options in charset name. (SpecialEncodingPrefix_Encoding_EncodingOptions)
- Buffer managment:
+ SetBufferSize ( 0 - autogrow )
- - Language autodetection
- + Using spellchecker (aspell)
- - Look on ofline translation libraries
+ - Look at offline translation libraries and other possibilities to improve
+ translation and language detection.
on request:
- Multibyte(not-UTF8) support for FS classes
diff --git a/configure.in b/configure.in
index b833095..16051b5 100644
--- a/configure.in
+++ b/configure.in
@@ -194,6 +194,13 @@ else
fi
fi
+AM_PATH_ASPELL([
+ AC_DEFINE(HAVE_ASPELL,1,[Defines if aspell is available])
+ HAVE_ASPELL=yes
+],[
+ HAVE_ASPELL=no
+])
+
dnl Checks for typedefs, structures, and compiler characteristics.
AC_C_CONST
@@ -209,6 +216,7 @@ echo " Enca Charset Detection Support: $HAVE_ENCA"
echo " LibRCD Charset Detection Support: $HAVE_RCD"
echo ""
echo " Multilanguage support with DB4: $HAVE_BDB"
+echo " Language autodetection using aspell: $HAVE_ASPELL"
echo " Libtranslate support: $HAVE_LIBTRANSLATE"
echo " Libtranslate Timed Translate: $HAVE_LIBTRANSLATE_TIMED_TRANSLATE"
echo ""
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 99b7506..710b7dc 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -20,6 +20,9 @@ endif
EXTRA_DIST= input-russian.txt rcc.xml rcc-example.xml mpg123-rcc.patch
+test: example2
+ cat input-russian.txt | ./example2
+
test-russian: example2
cat input-russian.txt | ./example2 ru
diff --git a/examples/example2.c b/examples/example2.c
index cc4a3fa..5ef3efb 100644
--- a/examples/example2.c
+++ b/examples/example2.c
@@ -24,27 +24,26 @@ int main(int argc, char *argv[]) {
rccInit();
rccInitDefaultContext(NULL, 0, 0, classes, 0);
rccInitDb4(NULL, "example", 0);
- rccSetOption(NULL, RCC_OPTION_TRANSLATE, 1);
-
-// rccExternalInit();
-// rccExternalFree();
+ rccSetOption(NULL, RCC_OPTION_TRANSLATE, RCC_OPTION_TRANSLATE_FULL);
current_language_id = rccGetCurrentLanguage(NULL);
english_language_id = rccGetLanguageByName(NULL, "en");
if (argc>1) rccSetLanguageByName(NULL, argv[1]);
+ else rccSetOption(NULL, RCC_OPTION_AUTODETECT_LANGUAGE, 1);
language_id = rccGetCurrentLanguage(NULL);
language = rccGetCurrentLanguageName(NULL);
if (language) printf("Current Language: %s\n\n", language);
- else printf("Unable Detect Language\n\n");
+ else {
+ printf("Unable Detect Language, using english\n\n");
+ rccSetLanguageByName(NULL, "en");
+ }
while (fgets(buf,255,stdin)) {
if (strlen(buf)<2) break;
- rccSetLanguage(NULL, language_id);
rccstring = rccFrom(NULL, 0, buf);
if (rccstring) {
- rccSetLanguage(NULL, english_language_id);
recoded = rccTo(NULL, 1, rccstring);
if (recoded) {
printf(recoded);
diff --git a/examples/input-russian.txt b/examples/input-russian.txt
index 8ea6e44..b89c105 100644
--- a/examples/input-russian.txt
+++ b/examples/input-russian.txt
@@ -2,3 +2,5 @@
ïðèâåò
¯à¨¢¥â
привет
+Good Morning
+Guten Abend
diff --git a/examples/rcc-gtk-config.c b/examples/rcc-gtk-config.c
index d0775a6..7758f09 100644
--- a/examples/rcc-gtk-config.c
+++ b/examples/rcc-gtk-config.c
@@ -38,15 +38,15 @@ int main (int argc, char *argv[])
if (argc<1) config = argv[0];
else config = argv[1];
+ gtk_set_locale ();
+ gtk_init (&argc, &argv);
+
rccInit();
rccUiInit();
ctx = rccCreateContext(NULL, 0, 0, classes, 0);
rccLoad(ctx, config);
uictx = rccUiCreateContext(ctx);
- gtk_set_locale ();
- gtk_init (&argc, &argv);
-
window1 = gtk_window_new (GTK_WINDOW_TOPLEVEL);
gtk_window_set_policy(GTK_WINDOW (window1), FALSE, FALSE, TRUE);
gtk_window_set_title (GTK_WINDOW (window1), "LibRCC Config");
diff --git a/m4/aspell.m4 b/m4/aspell.m4
new file mode 100644
index 0000000..164e534
--- /dev/null
+++ b/m4/aspell.m4
@@ -0,0 +1,44 @@
+# This file is part of GNOME Translate.
+#
+# Copyright (C) 2004 Jean-Yves Lefort.
+#
+# As a special exception to the GNOME Translate licensing terms,
+# Jean-Yves Lefort gives unlimited permission to copy, distribute and
+# modify this file.
+
+dnl AM_PATH_ASPELL([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl
+dnl Checks that a program including <aspell.h> can be built with -laspell
+dnl and run. Substitutes ASPELL_CFLAGS and ASPELL_LIBS; both are emptied
+dnl when the check fails. When cross-compiling, Aspell is assumed present.
+dnl
+AC_DEFUN([AM_PATH_ASPELL],
+[ASPELL_CFLAGS=""
+ASPELL_LIBS="-laspell"
+
+ac_save_CFLAGS="$CFLAGS"
+ac_save_LIBS="$LIBS"
+CFLAGS="$CFLAGS $ASPELL_CFLAGS"
+LIBS="$LIBS $ASPELL_LIBS"
+
+AC_MSG_CHECKING([for Aspell])
+dnl The probe returns from main rather than calling an undeclared exit,
+dnl which compilers rejecting implicit declarations treat as an error.
+dnl It also calls into the library so the symbol really has to link.
+AC_RUN_IFELSE([
+#include <aspell.h>
+
+int main(void) {
+    AspellConfig *config = new_aspell_config();
+
+    if (config) delete_aspell_config(config);
+    return 0;
+}
+], [found=yes], [found=no], [found=yes])
+AC_MSG_RESULT($found)
+
+CFLAGS="$ac_save_CFLAGS"
+LIBS="$ac_save_LIBS"
+
+if test "$found" = yes; then
+    ifelse([$1],, :, [$1])
+else
+    ASPELL_CFLAGS=""
+    ASPELL_LIBS=""
+    ifelse([$2],, :, [$2])
+fi
+
+AC_SUBST(ASPELL_CFLAGS)
+AC_SUBST(ASPELL_LIBS)])
diff --git a/src/Makefile.am b/src/Makefile.am
index baa08a4..4ba3c35 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -12,6 +12,7 @@ librcc_la_SOURCES = librcc.c \
fake_enca.h fake_rcd.h \
rccenca.c rccenca.h \
rccdb4.c rccdb4.h \
+ rccspell.c rccspell.h \
engine.c engine.h \
rccstring.c rccstring.h \
rccxml.c rccxml.h \
@@ -22,7 +23,7 @@ librcc_la_SOURCES = librcc.c \
internal.h
include_HEADERS = librcc.h
-AM_CPPFLAGS = -I../src -DLIBRCC_DATA_DIR=\"${pkgdatadir}\" @XML_INCLUDES@ @DLOPEN_INCLUDES@ @RCD_INCLUDES@ @ENCA_INCLUDES@ @BDB_INCLUDES@
-librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@
+AM_CPPFLAGS = -I../src -DLIBRCC_DATA_DIR=\"${pkgdatadir}\" @XML_INCLUDES@ @DLOPEN_INCLUDES@ @RCD_INCLUDES@ @ENCA_INCLUDES@ @BDB_INCLUDES@ @ASPELL_CFLAGS@
+librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@
librcc_la_LDFLAGS = -version-info @LIBRCC_VERSION_INFO@
diff --git a/src/librcc.h b/src/librcc.h
index 52e6be4..d08937e 100644
--- a/src/librcc.h
+++ b/src/librcc.h
@@ -364,6 +364,23 @@ typedef int rcc_option_value;
#define RCC_OPTION_LEARNING_FLAG_LEARN 2
/**
+ * Switch translation off.
+ */
+#define RCC_OPTION_TRANSLATE_OFF 0
+/**
+ * Translate data to the English language (the current language doesn't matter).
+ */
+#define RCC_OPTION_TRANSLATE_TO_ENGLISH 1
+/**
+ * Skip translation of the english text.
+ */
+#define RCC_OPTION_TRANSLATE_SKIP_ENGLISH 2
+/**
+ * Translate whole data to the current language.
+ */
+#define RCC_OPTION_TRANSLATE_FULL 3
+
+/**
* List of options available
*/
typedef enum rcc_option_t {
@@ -371,8 +388,9 @@ typedef enum rcc_option_t {
RCC_OPTION_AUTODETECT_FS_TITLES, /**< Detect titles of #RCC_CLASS_FS classes */
RCC_OPTION_AUTODETECT_FS_NAMES, /**< Try to find encoding of #RCC_CLASS_FS by accessing fs */
RCC_OPTION_CONFIGURED_LANGUAGES_ONLY, /**< Use only configured languages or languages with auto-engines */
- RCC_OPTION_TRANSLATE, /**< Translate #rcc_string if it's language differs from current one */
RCC_OPTION_AUTOENGINE_SET_CURRENT, /**< If enabled autodetection engine will set current charset */
+ RCC_OPTION_AUTODETECT_LANGUAGE, /**< Enables language detection */
+ RCC_OPTION_TRANSLATE, /**< Translate #rcc_string if it's language differs from current one */
RCC_MAX_OPTIONS
} rcc_option;
@@ -970,6 +988,26 @@ int rccTranslateSetTimeout(rcc_translate translate, unsigned long us);
char *rccTranslate(rcc_translate translate, const char *buf);
/* recode.c */
+
+/**
+ * Tries to detect language of string
+ * @param ctx is working context ( or default one if NULL supplied )
+ * @param class_id is encoding class
+ * @param buf is original string (perhaps not zero terminated)
+ * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function.
+ * @result is language_id or -1 if autodetection failed
+ */
+rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len);
+/**
+ * Tries to detect charset of string
+ * @param ctx is working context ( or default one if NULL supplied )
+ * @param class_id is encoding class
+ * @param buf is original string (perhaps not zero terminated)
+ * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function.
+ * @result is auto_charset_id or -1 if autodetection failed
+ */
+int rccDetectCharset(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len);
+
/**
* Recode string from specified encoding class to #rcc_string. Encoding detection engines and
* recoding cache are used (if possible) to detect original 'buf' encoding. Otherwise the
@@ -1079,7 +1117,7 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char
* @param rlen in rlen the size of recoded string will be returned.
* @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory.
*/
-char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen);
+rcc_string rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen);
/**
* Recode string between specified encodings.
*
@@ -1094,6 +1132,77 @@ char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const ch
char *rccSizedRecodeCharsets(rcc_context ctx, const char *from, const char *to, const char *buf, size_t len, size_t *rlen);
+/**
+ * Tries to detect charset of string
+ * @param config is language configuration
+ * @param class_id is encoding class
+ * @param buf is original string (perhaps not zero terminated)
+ * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function.
+ * @result is auto_charset_id or -1 if autodetection failed
+ */
+rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len);
+
+/**
+ * Recode string from specified encoding class to #rcc_string. Encoding detection engines and
+ * recoding cache are used (if possible) to detect original 'buf' encoding. Otherwise the
+ * preconfigured encoding of class is assumed.
+ *
+ * @param config is language configuration
+ * @param class_id is encoding class
+ * @param buf is original string (perhaps not zero terminated)
+ * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function.
+ * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory.
+ */
+rcc_string rccConfigSizedFrom(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len);
+/**
+ * Recode string from #rcc_string to specified encoding class. If encoding class is of
+ * 'File System' type, the autoprobing for file names can be performed. In the other cases
+ * the rcc_string will be recoded in preconfigured class encoding.
+ *
+ * @param config is language configuration
+ * @param class_id is encoding class
+ * @param buf is original zero terminated string
+ * @param rlen in rlen the size of recoded string will be returned.
+ * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory.
+ */
+char *rccConfigSizedTo(rcc_language_config config, rcc_class_id class_id, rcc_const_string buf, size_t *rlen);
+/**
+ * Recode string between different encoding classes. The conversion relies on the
+ * rccConfigSizedFrom and rccConfigSizedTo functions.
+ * @see rccConfigSizedFrom
+ * @see rccConfigSizedTo
+ *
+ * @param config is language configuration
+ * @param from is source encoding class
+ * @param to is destination encoding class
+ * @param buf is original string (perhaps not zero terminated)
+ * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function.
+ * @param rlen in rlen the size of recoded string will be returned.
+ * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory.
+ */
+char *rccConfigSizedRecode(rcc_language_config config, rcc_class_id from, rcc_class_id to, const char *buf, size_t len, size_t *rlen);
+/**
+ * Recode string from specified encoding to #rcc_string.
+ *
+ * @param config is language configuration
+ * @param charset is source encoding
+ * @param buf is original string (perhaps not zero terminated)
+ * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function.
+ * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory.
+ */
+rcc_string rccConfigSizedRecodeFromCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen);
+/**
+ * Recode string from #rcc_string to specified encoding.
+ *
+ * @param config is language configuration
+ * @param charset is destination encoding
+ * @param buf is original zero terminated string
+ * @param rlen in rlen the size of recoded string will be returned.
+ * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory.
+ */
+char *rccConfigSizedRecodeToCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, rcc_const_string buf, size_t len, size_t *rlen);
+
+
#define rccFrom(ctx, class_id, buf) rccSizedFrom(ctx, class_id, buf, 0)
#define rccTo(ctx, class_id, buf) rccSizedTo(ctx, class_id, buf, NULL)
#define rccRecode(ctx, from, to, buf) rccSizedRecode(ctx, from, to, buf, 0, NULL)
@@ -1104,6 +1213,12 @@ char *rccSizedRecodeCharsets(rcc_context ctx, const char *from, const char *to,
#define rccRecodeFromCharset(ctx, class_id, charset, buf) rccSizedRecodeFromCharset(ctx, class_id, charset, buf, 0, NULL)
#define rccRecodeCharsets(ctx, from, to, buf) rccSizedRecodeCharsets(ctx, from, to, buf, 0, NULL)
+#define rccConfigFrom(ctx, class_id, buf) rccConfigSizedFrom(ctx, class_id, buf, 0)
+#define rccConfigTo(ctx, class_id, buf) rccConfigSizedTo(ctx, class_id, buf, NULL)
+#define rccConfigRecode(ctx, from, to, buf) rccConfigSizedRecode(ctx, from, to, buf, 0, NULL)
+#define rccConfigRecodeToCharset(ctx, class_id, charset, buf) rccConfigSizedRecodeToCharset(ctx, class_id, charset, buf, 0, NULL)
+#define rccConfigRecodeFromCharset(ctx, class_id, charset, buf) rccConfigSizedRecodeFromCharset(ctx, class_id, charset, buf, 0, NULL)
+
/*******************************************************************************
******************************** Options ***************************************
*******************************************************************************/
diff --git a/src/lng.c b/src/lng.c
index b0ce7cd..4589de6 100644
--- a/src/lng.c
+++ b/src/lng.c
@@ -36,11 +36,39 @@ rcc_language_id rccGetLanguageByName(rcc_context ctx, const char *name) {
return (rcc_language_id)-1;
}
-static rcc_language_id rccGetDefaultLanguage(rcc_context ctx) {
- unsigned int i;
+int rccCheckLanguageUsability(rcc_context ctx, rcc_language_id language_id) {
+ rcc_language_config config;
rcc_option_value clo;
rcc_engine_ptr *engines;
- rcc_language_config config;
+ rcc_charset *charsets;
+
+ if (!ctx) {
+ if (rcc_default_ctx) ctx = rcc_default_ctx;
+ else return 0;
+ }
+ if (language_id>=ctx->n_languages) return 0;
+
+ language_id = rccGetRealLanguage(ctx, language_id);
+
+ clo = rccGetOption(ctx, RCC_OPTION_CONFIGURED_LANGUAGES_ONLY);
+ if (clo) {
+ config = rccCheckConfig(ctx, (rcc_language_id)language_id);
+ if ((!config)||(!config->configured)) {
+ charsets = ctx->languages[language_id]->charsets;
+ if ((charsets[0])&&(charsets[1])&&(charsets[2])) {
+ if (clo == 1) {
+ engines = ctx->languages[language_id]->engines;
+ if ((!engines[0])||(!engines[1])) return 0;
+ } else return 0;
+ }
+ }
+ }
+ return 1;
+}
+
+
+static rcc_language_id rccGetDefaultLanguage(rcc_context ctx) {
+ unsigned int i;
char stmp[RCC_MAX_LANGUAGE_CHARS+1];
if (ctx->default_language) return ctx->default_language;
@@ -48,16 +76,7 @@ static rcc_language_id rccGetDefaultLanguage(rcc_context ctx) {
if (!rccLocaleGetLanguage(stmp, ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) {
for (i=0;ctx->languages[i];i++) {
if (!strcmp(ctx->languages[i]->sn, stmp)) {
- clo = rccGetOption(ctx, RCC_OPTION_CONFIGURED_LANGUAGES_ONLY);
- if (clo) {
- config = rccCheckConfig(ctx, (rcc_language_id)i);
- if ((!config)||(!config->configured)) {
- if (clo == 1) {
- engines = ctx->languages[i]->engines;
- if ((!engines[0])||(!engines[1])) break;
- } else break;
- }
- }
+ if (!rccCheckLanguageUsability(ctx, (rcc_language_id)i)) break;
ctx->default_language = (rcc_language_id)i;
return (rcc_language_id)i;
}
diff --git a/src/lng.h b/src/lng.h
index 7602e10..f3a7735 100644
--- a/src/lng.h
+++ b/src/lng.h
@@ -4,6 +4,8 @@
#include "internal.h"
#include "lngconfig.h"
+
+int rccCheckLanguageUsability(rcc_context ctx, rcc_language_id language_id);
rcc_language_ptr rccGetLanguagePointer(rcc_context ctx, rcc_language_id language_id);
#define rccGetCurrentEnginePointer(ctx) rccConfigGetCurrentEnginePointer(ctx->current_config)
diff --git a/src/lngconfig.c b/src/lngconfig.c
index c50ee74..26d0779 100644
--- a/src/lngconfig.c
+++ b/src/lngconfig.c
@@ -2,9 +2,12 @@
#include <stdlib.h>
#include <string.h>
+#include "../config.h"
+
#include "internal.h"
#include "rccconfig.h"
#include "rcclocale.h"
+#include "lng.h"
rcc_engine_ptr rccConfigGetEnginePointer(rcc_language_config config, rcc_engine_id engine_id) {
unsigned int i;
@@ -165,6 +168,7 @@ int rccConfigInit(rcc_language_config config, rcc_context ctx) {
config->fsiconv = NULL;
config->trans = NULL;
+ config->entrans = NULL;
config->ctx = ctx;
config->language = NULL;
@@ -172,6 +176,7 @@ int rccConfigInit(rcc_language_config config, rcc_context ctx) {
config->engine = -1;
config->default_charset = dcharsets;
config->configured = 0;
+ config->speller = NULL;
config->iconv_to = iconv_to;
config->configure = 1;
@@ -204,6 +209,10 @@ void rccConfigClear(rcc_language_config config) {
rccTranslateClose(config->trans);
config->trans = NULL;
}
+ if (config->entrans) {
+ rccTranslateClose(config->entrans);
+ config->entrans = NULL;
+ }
if (config->iconv_to) {
free(config->iconv_to);
config->iconv_to = NULL;
@@ -216,31 +225,55 @@ void rccConfigClear(rcc_language_config config) {
free(config->default_charset);
config->default_charset = NULL;
}
+ if (config->speller) {
+ rccSpellerFree(config->speller);
+ config->speller = NULL;
+ }
}
}
-rcc_language_config rccCheckConfig(rcc_context ctx, rcc_language_id language_id) {
- rcc_language_id new_language_id;
-
- new_language_id = rccGetRealLanguage(ctx, language_id);
- if ((new_language_id == (rcc_language_id)-1)||(new_language_id != language_id)) return NULL;
- if (!ctx->configs[language_id].charset) return NULL;
- if (!strcasecmp(ctx->languages[language_id]->sn, "off")) return NULL;
+/*
+ * Map language_id to its real language and return the matching slot of
+ * ctx->configs without initialising it.
+ *
+ * Returns NULL when rccGetRealLanguage() fails or when the language is the
+ * disabled pseudo-language (rcc_disabled_language_sn). If r_language_id is
+ * not NULL, the resolved language id is stored there on success.
+ */
+static rcc_language_config rccGetConfigPointer(rcc_context ctx, rcc_language_id language_id, rcc_language_id *r_language_id) {
+    language_id = rccGetRealLanguage(ctx, language_id);
+    /* A failed lookup yields (rcc_language_id)-1; it must not be used as an index. */
+    if (language_id == (rcc_language_id)-1) return NULL;
+    if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL;
+    if (r_language_id) *r_language_id = language_id;
+    return ctx->configs + language_id;
+}
+/*
+ * Return the configuration slot of language_id only when its charset field
+ * is already set; nothing is initialised here. Returns NULL otherwise, or
+ * when rccGetConfigPointer() yields no slot.
+ */
+rcc_language_config rccCheckConfig(rcc_context ctx, rcc_language_id language_id) {
+    rcc_language_config cfg = rccGetConfigPointer(ctx, language_id, NULL);
+
+    if (!cfg) return NULL;
+    return cfg->charset ? cfg : NULL;
+}
+
+
+/*
+ * Return the configuration of language_id, initialising it on demand, but
+ * only if rccCheckLanguageUsability() accepts the language.
+ *
+ * Returns NULL when rccGetConfigPointer() yields no slot, when the language
+ * is not usable, or when rccConfigInit() reports an error.
+ */
+rcc_language_config rccGetUsableConfig(rcc_context ctx, rcc_language_id language_id) {
+ rcc_language_config config;
+
+ /* language_id is overwritten with the id written back by rccGetConfigPointer */
+ config = rccGetConfigPointer(ctx, language_id, &language_id);
+ if (config) {
+ if (!rccCheckLanguageUsability(ctx, language_id)) return NULL;
+ /* charset not set yet: run rccConfigInit, a non-zero result is a failure */
+ if ((!config->charset)&&(rccConfigInit(config, ctx))) return NULL;
+ config->language = ctx->languages[language_id];
+ }
+
+ return config;
+}
+
rcc_language_config rccGetConfig(rcc_context ctx, rcc_language_id language_id) {
- language_id = rccGetRealLanguage(ctx, language_id);
- if (language_id == (rcc_language_id)-1) return NULL;
- if (!strcasecmp(ctx->languages[language_id]->sn, "off")) return NULL;
-
- if (!ctx->configs[language_id].charset) {
- if (rccConfigInit(ctx->configs+language_id, ctx)) return NULL;
- }
+ rcc_language_config config;
- ctx->configs[language_id].language = ctx->languages[language_id];
- return ctx->configs + language_id;
+ config = rccGetConfigPointer(ctx, language_id, &language_id);
+ if (config) {
+ if ((!config->charset)&&(rccConfigInit(config, ctx))) return NULL;
+ config->language = ctx->languages[language_id];
+ }
+
+ return config;
}
rcc_language_config rccGetConfigByName(rcc_context ctx, const char *name) {
@@ -261,6 +294,15 @@ rcc_language_config rccGetCurrentConfig(rcc_context ctx) {
return rccGetConfig(ctx, language_id);
}
+/*
+ * Return the speller attached to config, creating it on first use from the
+ * short name of the config's language. Returns NULL for a NULL config; the
+ * result of rccSpellerCreate() is stored and returned as is.
+ */
+rcc_speller rccConfigGetSpeller(rcc_language_config config) {
+    if (!config) return NULL;
+
+    if (!config->speller) config->speller = rccSpellerCreate(config->language->sn);
+    return config->speller;
+}
+
rcc_engine_id rccConfigGetSelectedEngine(rcc_language_config config) {
if (!config) return (rcc_engine_id)-1;
@@ -532,6 +574,161 @@ int rccConfigConfigure(rcc_language_config config) {
return 0;
}
+
+/*
+ * Convert buf (len bytes) to an rcc_string using the given language
+ * configuration.
+ *
+ * Returns NULL when config is NULL, when rccStringSizedCheck() returns
+ * non-zero for buf, or when no source charset name can be obtained.
+ * Otherwise returns the string found in db4 or the result of
+ * rccSizedFromCharset().
+ */
+rcc_string rccConfigSizedFrom(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) {
+ rcc_context ctx;
+ rcc_string result;
+ rcc_option_value usedb4;
+ rcc_autocharset_id charset_id;
+ const char *charset;
+
+
+ if (!config) return NULL;
+ ctx = config->ctx;
+
+ /* NOTE(review): presumably non-zero means buf already is an rcc_string -- confirm */
+ if (rccStringSizedCheck(buf, len)) return NULL;
+
+ usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+
+ /* learning mode has the "use" flag set: look the string up in db4 first */
+ if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
+ result = rccDb4GetKey(ctx->db4ctx, buf, len);
+ if (result) {
+ /* drop the db4 answer if rccStringFixID fails and fall through to recoding */
+ if (rccStringFixID(result, ctx)) free(result);
+ else return result;
+ }
+ }
+
+ /* prefer the autodetected charset, otherwise the charset currently set for the class */
+ charset_id = rccConfigDetectCharset(config, class_id, buf, len);
+ if (charset_id != (rcc_autocharset_id)-1)
+ charset = rccConfigGetAutoCharsetName(config, charset_id);
+ else
+ charset = rccConfigGetCurrentCharsetName(config, class_id);
+
+ if (charset) {
+ result = rccSizedFromCharset(ctx, charset, buf, len);
+ /* stamp the result with the id of this configuration's language */
+ if (result) rccStringChangeID(result, rccGetLanguageByName(ctx, config->language->sn));
+ return result;
+ }
+
+ return NULL;
+}
+
+/*
+ * Recode the rcc_string buf into the charset currently configured for
+ * class_id in the given language configuration. rlen is passed through to
+ * rccSizedToCharset(). Returns NULL when config is NULL or no charset name
+ * is available.
+ */
+char *rccConfigSizedTo(rcc_language_config config, rcc_class_id class_id, rcc_const_string buf, size_t *rlen) {
+    const char *target;
+
+    if (!config) return NULL;
+
+    target = rccConfigGetCurrentCharsetName(config, class_id);
+    if (!target) return NULL;
+
+    return rccSizedToCharset(config->ctx, target, buf, rlen);
+}
+
+
+/*
+ * Recode buf (len bytes) from encoding class 'from' to encoding class 'to'
+ * using the given language configuration. rlen is passed through to the
+ * function that performs the conversion.
+ *
+ * Returns NULL when config is NULL, when rccStringSizedCheck() returns
+ * non-zero for buf, or when either charset name is unavailable.
+ */
+char *rccConfigSizedRecode(rcc_language_config config, rcc_class_id from, rcc_class_id to, const char *buf, size_t len, size_t *rlen) {
+ rcc_context ctx;
+ rcc_string result;
+ rcc_option_value usedb4;
+ rcc_autocharset_id charset_id;
+ rcc_string stmp;
+ const char *tocharset, *fromcharset;
+
+
+ if (!config) return NULL;
+ ctx = config->ctx;
+
+ /* NOTE(review): presumably non-zero means buf already is an rcc_string -- confirm */
+ if (rccStringSizedCheck(buf, len)) return NULL;
+
+ usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+
+ /* learning mode has the "use" flag set: look the string up in db4 first */
+ if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
+ stmp = rccDb4GetKey(ctx->db4ctx, buf, len);
+ if (stmp) {
+ /* drop the db4 answer if rccStringFixID fails and fall through to recoding */
+ if (rccStringFixID(stmp, ctx)) free(stmp);
+ else {
+ /* the db4 entry is an rcc_string: convert it to the target class and release it */
+ result = rccConfigSizedTo(config, to, stmp, rlen);
+ free(stmp);
+ return result;
+ }
+ }
+ }
+
+ /* source charset: autodetected if possible, otherwise the one set for 'from' */
+ charset_id = rccConfigDetectCharset(config, from, buf, len);
+ if (charset_id != (rcc_autocharset_id)-1)
+ fromcharset = rccConfigGetAutoCharsetName(config, charset_id);
+ else
+ fromcharset = rccConfigGetCurrentCharsetName(config, from);
+
+ tocharset = rccConfigGetCurrentCharsetName(config, to);
+
+ if ((fromcharset)&&(tocharset))
+ return rccSizedRecodeCharsets(ctx, fromcharset, tocharset, buf, len, rlen);
+
+ return NULL;
+
+}
+
+
+/*
+ * Recode buf (len bytes) from the encoding of class_id, as seen by the given
+ * language configuration, to the explicitly named charset. rlen is passed
+ * through to the function that performs the conversion.
+ *
+ * Returns NULL when config is NULL, when rccStringSizedCheck() returns
+ * non-zero for buf, or when no source charset name is available.
+ *
+ * NOTE(review): buf is declared rcc_const_string, yet the body uses it as a
+ * raw buffer (db4 key, charset detection, rccSizedRecodeCharsets input) --
+ * confirm the intended parameter type.
+ */
+char *rccConfigSizedRecodeToCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, rcc_const_string buf, size_t len, size_t *rlen) {
+ rcc_context ctx;
+ rcc_string result;
+ rcc_option_value usedb4;
+ rcc_autocharset_id charset_id;
+ rcc_string stmp;
+ const char *ocharset;
+
+
+ if (!config) return NULL;
+ ctx = config->ctx;
+
+ if (rccStringSizedCheck(buf, len)) return NULL;
+
+ usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+
+ /* learning mode has the "use" flag set: look the string up in db4 first */
+ if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
+ stmp = rccDb4GetKey(ctx->db4ctx, buf, len);
+ if (stmp) {
+ /* drop the db4 answer if rccStringFixID fails and fall through to recoding */
+ if (rccStringFixID(stmp, ctx)) free(stmp);
+ else {
+ /* the db4 entry is an rcc_string: convert it to the requested charset and release it */
+ result = rccSizedToCharset(ctx, charset, stmp, rlen);
+ free(stmp);
+ return result;
+ }
+ }
+ }
+
+ /* source charset: autodetected if possible, otherwise the one set for class_id */
+ charset_id = rccConfigDetectCharset(config, class_id, buf, len);
+ if (charset_id != (rcc_autocharset_id)-1)
+ ocharset = rccConfigGetAutoCharsetName(config, charset_id);
+ else
+ ocharset = rccConfigGetCurrentCharsetName(config, class_id);
+
+ if (ocharset)
+ return rccSizedRecodeCharsets(ctx, ocharset, charset, buf, len, rlen);
+
+ return NULL;
+}
+
+/*
+ * Recode buf (len bytes) from the explicitly named charset to the charset
+ * currently configured for class_id in the given language configuration.
+ * rlen is passed through to rccSizedRecodeCharsets(). Returns NULL when
+ * config is NULL or no destination charset name is available.
+ */
+char *rccConfigSizedRecodeFromCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen) {
+    const char *target;
+
+    if (!config) return NULL;
+
+    target = rccConfigGetCurrentCharsetName(config, class_id);
+    if (!target) return NULL;
+
+    return rccSizedRecodeCharsets(config->ctx, charset, target, buf, len, rlen);
+}
+
+
/*
rcc_option_value options[RCC_MAX_OPTIONS];
diff --git a/src/lngconfig.h b/src/lngconfig.h
index 92cc050..9d23139 100644
--- a/src/lngconfig.h
+++ b/src/lngconfig.h
@@ -3,6 +3,7 @@
#include "rcciconv.h"
#include "rcctranslate.h"
+#include "rccspell.h"
struct rcc_language_config_t {
rcc_context ctx;
@@ -17,8 +18,10 @@ struct rcc_language_config_t {
unsigned char configured;
+ rcc_speller speller;
rcc_translate trans;
rcc_language_id translang;
+ rcc_translate entrans;
rcc_iconv fsiconv;
};
@@ -30,9 +33,13 @@ rcc_engine_ptr rccConfigCheckEnginePointer(rcc_language_config config, rcc_engin
rcc_engine_ptr rccConfigGetCurrentEnginePointer(rcc_language_config config);
rcc_engine_ptr rccConfigCheckCurrentEnginePointer(rcc_language_config config);
+rcc_speller rccConfigGetSpeller(rcc_language_config config);
+
int rccConfigInit(rcc_language_config config, rcc_context ctx);
void rccConfigClear(rcc_language_config config);
+rcc_language_config rccGetUsableConfig(rcc_context ctx, rcc_language_id language_id);
+
int rccConfigConfigure(rcc_language_config config);
rcc_charset_id rccConfigGetLocaleUnicodeCharset(rcc_language_config config, const char *locale_variable);
diff --git a/src/rccconfig.c b/src/rccconfig.c
index ed6d30a..f820606 100644
--- a/src/rccconfig.c
+++ b/src/rccconfig.c
@@ -12,13 +12,18 @@ rcc_language_alias rcc_default_aliases[] = {
{ NULL, NULL}
};
+const char rcc_default_language_sn[] = "default";
+const char rcc_disabled_language_sn[] = "Off";
+const char rcc_english_language_sn[] = "en";
+const char rcc_disabled_engine_sn[] = "Off";
const char rcc_default_charset[] = "Default";
+
const char rcc_utf8_charset[] = "UTF-8";
const char rcc_engine_nonconfigured[] = "Default";
const char rcc_option_nonconfigured[] = "DEFAULT";
rcc_engine rcc_default_engine = {
- "Off", NULL, NULL, NULL, {NULL}
+ rcc_disabled_engine_sn, NULL, NULL, NULL, {NULL}
};
rcc_engine rcc_russian_engine = {
@@ -32,11 +37,11 @@ rcc_engine rcc_ukrainian_engine = {
rcc_language rcc_default_languages[RCC_MAX_LANGUAGES + 1];
rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = {
-{"default", {rcc_default_charset, NULL}, {
+{rcc_default_language_sn, {rcc_default_charset, NULL}, {
&rcc_default_engine,
NULL
}},
-{"off", {rcc_default_charset, NULL}, {
+{rcc_disabled_language_sn, {rcc_default_charset, NULL}, {
&rcc_default_engine,
NULL
}},
@@ -112,14 +117,28 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = {
rcc_option_value_name rcc_sn_boolean[] = { "OFF", "ON", NULL };
rcc_option_value_name rcc_sn_learning[] = { "OFF", "ON", "RELEARN", "LEARN", NULL };
rcc_option_value_name rcc_sn_clo[] = { "ALL", "CONFIGURED_AND_AUTO", "CONFIGURED_ONLY", NULL };
+rcc_option_value_name rcc_sn_translate[] = { "OFF", "TO_ENGLISH", "SKIP_ENGLISH", "FULL", NULL };
rcc_option_description rcc_option_descriptions[RCC_MAX_OPTIONS+1];
rcc_option_description rcc_option_descriptions_embeded[RCC_MAX_OPTIONS+1] = {
+#ifdef HAVE_DB_H
{RCC_OPTION_LEARNING_MODE, 1, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1 }, RCC_OPTION_TYPE_STANDARD, "LEARNING_MODE", rcc_sn_learning },
+#else
+ {RCC_OPTION_LEARNING_MODE, 1, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1 }, RCC_OPTION_TYPE_INVISIBLE, "LEARNING_MODE", rcc_sn_learning },
+#endif /* HAVE_DB_H */
{RCC_OPTION_AUTODETECT_FS_NAMES, 1, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTODETECT_FS_NAMES", rcc_sn_boolean},
{RCC_OPTION_AUTODETECT_FS_TITLES, 1, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_INVISIBLE, "AUTODETECT_FS_TITLES", rcc_sn_boolean},
{RCC_OPTION_CONFIGURED_LANGUAGES_ONLY, 1, { RCC_OPTION_RANGE_TYPE_MENU, 0, 2, 1}, RCC_OPTION_TYPE_INVISIBLE, "CONFIGURED_LANGUAGES_ONLY", rcc_sn_clo},
- {RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "TRANSLATE", rcc_sn_boolean },
+#ifdef HAVE_ASPELL
+ {RCC_OPTION_AUTODETECT_LANGUAGE, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTODETECT_LANGUAGE", rcc_sn_boolean},
+#else
+ {RCC_OPTION_AUTODETECT_LANGUAGE, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_INVISIBLE, "AUTODETECT_LANGUAGE", rcc_sn_boolean},
+#endif
+#ifdef HAVE_LIBTRANSLATE
+ {RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1}, RCC_OPTION_TYPE_STANDARD, "TRANSLATE", rcc_sn_translate },
+#else
+ {RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1}, RCC_OPTION_TYPE_INVISIBLE, "TRANSLATE", rcc_sn_translate },
+#endif /* HAVE_LIBTRANSLATE */
{RCC_OPTION_AUTOENGINE_SET_CURRENT, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTOENGINE_SET_CURRENT", rcc_sn_boolean },
{RCC_MAX_OPTIONS}
};
diff --git a/src/rccconfig.h b/src/rccconfig.h
index b94a39b..8e794ba 100644
--- a/src/rccconfig.h
+++ b/src/rccconfig.h
@@ -6,6 +6,11 @@
#undef RCC_DEBUG
#define RCC_LOCALE_VARIABLE "LC_CTYPE"
+extern const char rcc_default_language_sn[];
+extern const char rcc_english_language_sn[];
+extern const char rcc_disabled_language_sn[];
+extern const char rcc_disabled_engine_sn[];
+
extern rcc_language_alias rcc_default_aliases[];
extern const char rcc_default_charset[];
extern const char rcc_utf8_charset[];
diff --git a/src/rccexternal.c b/src/rccexternal.c
index 16b3667..4a09948 100644
--- a/src/rccexternal.c
+++ b/src/rccexternal.c
@@ -153,7 +153,7 @@ int rccExternalConnect(unsigned char module) {
fd_set fdcon;
if (pid == (pid_t)-1) return -1;
-
+
sock = socket(PF_UNIX, SOCK_STREAM, 0);
if (sock<=0) return -1;
diff --git a/src/rcciconv.c b/src/rcciconv.c
index d9903de..93278a7 100644
--- a/src/rcciconv.c
+++ b/src/rcciconv.c
@@ -48,6 +48,11 @@ void rccIConvClose(rcc_iconv icnv) {
}
}
+/**
+ * Reports whether a conversion handle is usable.
+ *
+ * @param icnv conversion handle to inspect (may be NULL)
+ * @return 0 when icnv is non-NULL and its iconv descriptor is not (iconv_t)-1,
+ *         -1 otherwise
+ */
+int rccIConvGetError(rcc_iconv icnv) {
+    if (!icnv) return -1;
+    if (icnv->icnv == (iconv_t)-1) return -1;
+
+    return 0;
+}
+
size_t rccIConvRecode(rcc_iconv icnv, char *outbuf, size_t outsize, const char *buf, size_t size) {
char *in_buf, *out_buf, err;
int in_left, out_left;
diff --git a/src/rcciconv.h b/src/rcciconv.h
index 0070696..1520534 100644
--- a/src/rcciconv.h
+++ b/src/rcciconv.h
@@ -8,6 +8,8 @@ struct rcc_iconv_t {
};
typedef struct rcc_iconv_t rcc_iconv_s;
+int rccIConvGetError(rcc_iconv icnv);
+
size_t rccIConvInternal(rcc_context ctx, rcc_iconv icnv, const char *buf, size_t len);
/**
diff --git a/src/rccspell.c b/src/rccspell.c
new file mode 100644
index 0000000..c54e267
--- /dev/null
+++ b/src/rccspell.c
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "rccspell.h"
+
+/**
+ * Creates a spell checker wrapper for the given language.
+ *
+ * When built with aspell support, an aspell speller is requested with the
+ * "encoding" option set to "utf-8" and the "master" option set to lang. If the
+ * speller cannot be created the wrapper is still returned, but its speller
+ * member is NULL; rccSpellerGetError() reports that state as an error.
+ *
+ * @param lang value passed to aspell as the "master" dictionary
+ * @return newly allocated wrapper (release it with rccSpellerFree()), or NULL
+ *         when lang is NULL, the allocation fails, or aspell support is not
+ *         compiled in
+ */
+rcc_speller rccSpellerCreate(const char *lang) {
+#ifdef HAVE_ASPELL
+    rcc_speller rccspeller;
+    AspellSpeller *speller = NULL;
+    AspellConfig *config;
+    AspellCanHaveError *possible_err;
+
+    if (!lang) return NULL;
+
+    rccspeller = malloc(sizeof *rccspeller);
+    if (!rccspeller) return NULL;
+
+    config = new_aspell_config();
+
+    if (config) {
+        if (aspell_config_replace(config, "encoding", "utf-8")&&aspell_config_replace(config, "master", lang)) {
+            possible_err = new_aspell_speller(config);
+            if (aspell_error_number(possible_err) == 0) {
+                speller = to_aspell_speller(possible_err);
+            } else {
+                /* On failure the error object is still ours; release it
+                   instead of leaking it. */
+                delete_aspell_can_have_error(possible_err);
+            }
+        }
+        delete_aspell_config(config);
+    }
+
+    rccspeller->speller = speller;
+    return rccspeller;
+#else
+    return NULL;
+#endif /* HAVE_ASPELL */
+}
+
+/**
+ * Releases a wrapper created by rccSpellerCreate() together with the aspell
+ * speller it holds, if any. NULL is accepted. Without aspell support this
+ * function does nothing.
+ *
+ * @param rccspeller wrapper to destroy (may be NULL)
+ */
+void rccSpellerFree(rcc_speller rccspeller) {
+#ifdef HAVE_ASPELL
+    if (rccspeller) {
+        if (rccspeller->speller) {
+            delete_aspell_speller(rccspeller->speller);
+        }
+        free(rccspeller);
+    }
+#endif /* HAVE_ASPELL */
+}
+
+/**
+ * Tells whether the wrapper holds a usable speller.
+ *
+ * @param rccspeller wrapper to inspect (may be NULL)
+ * @return 0 when both the wrapper and its speller member are non-NULL,
+ *         -1 otherwise
+ */
+int rccSpellerGetError(rcc_speller rccspeller) {
+    return ((rccspeller)&&(rccspeller->speller))?0:-1;
+}
+
+/**
+ * Checks a word of the given length with the wrapped speller.
+ *
+ * @param speller wrapper created by rccSpellerCreate()
+ * @param word    word to check
+ * @param len     length of word in bytes; 0 makes aspell receive -1 as the
+ *                size instead
+ * @return the result of aspell_speller_check() with negative values mapped
+ *         to 0; 0 when word is NULL, the speller is unusable, or aspell
+ *         support is not compiled in
+ */
+int rccSpellerSized(rcc_speller speller, const char *word, size_t len) {
+#ifdef HAVE_ASPELL
+    int res;
+
+    if ((!word)||(rccSpellerGetError(speller))) return 0;
+
+    /* Convert explicitly: in "len?len:-1" the -1 would first be converted to
+       size_t and only then narrowed to the int parameter. */
+    res = aspell_speller_check(speller->speller, word, len?(int)len:-1);
+    return res<0?0:res;
+#else
+    return 0;
+#endif /* HAVE_ASPELL */
+}
+
+/**
+ * Checks a word by forwarding to rccSpellerSized() with a length of 0.
+ *
+ * @param speller wrapper created by rccSpellerCreate()
+ * @param word    word to check
+ * @return whatever rccSpellerSized() returns for a zero length
+ */
+int rccSpeller(rcc_speller speller, const char *word) {
+    const size_t unspecified_length = 0;
+
+    return rccSpellerSized(speller, word, unspecified_length);
+}
diff --git a/src/rccspell.h b/src/rccspell.h
new file mode 100644
index 0000000..49e39f4
--- /dev/null
+++ b/src/rccspell.h
@@ -0,0 +1,29 @@
+#ifndef _RCC_SPELL_H
+#define _RCC_SPELL_H
+
+#include <stddef.h> /* size_t, used by rccSpellerSized() */
+
+#include "../config.h"
+
+#ifdef HAVE_ASPELL
+#include <aspell.h>
+#endif /* HAVE_ASPELL */
+
+/**
+ * Spell checker wrapper. The speller member is an aspell speller when aspell
+ * support is compiled in and an untyped pointer otherwise.
+ */
+struct rcc_speller_t {
+#ifdef HAVE_ASPELL
+    struct AspellSpeller *speller;
+#else
+    void *speller;
+#endif /* HAVE_ASPELL */
+};
+
+typedef struct rcc_speller_t *rcc_speller;
+typedef struct rcc_speller_t rcc_speller_s;
+
+/** Creates a wrapper for language lang; release it with rccSpellerFree(). */
+rcc_speller rccSpellerCreate(const char *lang);
+/** Destroys a wrapper created by rccSpellerCreate(). */
+void rccSpellerFree(rcc_speller speller);
+
+/** Returns 0 when the wrapper holds a speller and -1 otherwise. */
+int rccSpellerGetError(rcc_speller rccspeller);
+
+/** Checks the first len bytes of word; returns 0 when no speller is usable. */
+int rccSpellerSized(rcc_speller speller, const char *word, size_t len);
+/** Same as rccSpellerSized() called with a length of 0. */
+int rccSpeller(rcc_speller speller, const char *word);
+
+#endif /* _RCC_SPELL_H */
diff --git a/src/rccstring.c b/src/rccstring.c
index d6c6805..9c4c19f 100644
--- a/src/rccstring.c
+++ b/src/rccstring.c
@@ -58,6 +58,14 @@ int rccStringFixID(rcc_string string, rcc_context ctx) {
return 0;
}
+/**
+ * Overwrites the language id stored in the header of an rcc_string.
+ *
+ * @param string      string whose header is modified
+ * @param language_id new language id to store
+ * @return 0 on success, -1 when string is NULL
+ */
+int rccStringChangeID(rcc_string string, rcc_language_id language_id) {
+    /* The former guard "(!string)&&(language_id != -1)" let a NULL string
+       through whenever language_id was -1 and then dereferenced it. */
+    /* NOTE(review): an id of (rcc_language_id)-1 is still stored as before;
+       confirm whether it should be rejected as well. */
+    if (!string) return -1;
+
+    ((rcc_string_header*)string)->language_id = language_id;
+    return 0;
+}
+
+
void rccStringFree(rcc_string str) {
if (str) free(str);
}
diff --git a/src/rccstring.h b/src/rccstring.h
index 3c5d8d7..e9e9734 100644
--- a/src/rccstring.h
+++ b/src/rccstring.h
@@ -16,6 +16,7 @@ void rccStringFree(rcc_string str);
int rccStringSetLang(rcc_string string, const char *sn);
int rccStringFixID(rcc_string string, rcc_context ctx);
+int rccStringChangeID(rcc_string string, rcc_language_id language_id);
#ifdef HAVE_STRNLEN
# ifndef strnlen
diff --git a/src/rcctranslate.c b/src/rcctranslate.c
index 3bbd916..d7bb4e4 100644
--- a/src/rcctranslate.c
+++ b/src/rcctranslate.c
@@ -66,18 +66,22 @@ int rccTranslateSetTimeout(rcc_translate translate, unsigned long us) {
char *rccTranslate(rcc_translate translate, const char *buf) {
#ifdef HAVE_LIBTRANSLATE
- size_t i;
rcc_external_command_s resp;
size_t err, len;
char *buffer;
-
- if ((!translate)||(!buf)) return NULL;
+/*
+ size_t i;
+*/
+ if ((!translate)||(!buf)) return NULL;
+
+/*
if (!strcmp(translate->prefix.to, "en")) {
for (i=0;buf[i];i++)
if ((unsigned char)buf[i]>0x7F) break;
if (!buf[i]) return NULL;
}
+*/
if (translate->sock == -1) {
translate->sock = rccExternalConnect(RCC_EXTERNAL_MODULE_LIBRTRANSLATE);
diff --git a/src/recode.c b/src/recode.c
index c44095c..7e12343 100644
--- a/src/recode.c
+++ b/src/recode.c
@@ -2,6 +2,8 @@
#include <stdlib.h>
#include <string.h>
+#include "../config.h"
+
#include "internal.h"
#include "rcciconv.h"
#include "fs.h"
@@ -10,19 +12,140 @@
#include "rccconfig.h"
#include "rccdb4.h"
#include "rcctranslate.h"
+#include "rccspell.h"
+
+/* Word separator: any character below 0x7F that is not an ASCII letter. */
+#define isSpace(ch) (((ch)<0x7F)&&(((ch)<'A')||((ch)>'z')||(((ch)>'Z')&&((ch)<'a'))))
+/* Share of correctly spelled words a language needs to be accepted. */
+#define RCC_REQUIRED_PROBABILITY 0.66
+
+/**
+ * Tries to determine the language of a string.
+ *
+ * If the learning mode option has the USE flag set, the DB4 cache is consulted
+ * first and a cached string decides the language. Otherwise, provided the
+ * AUTODETECT_LANGUAGE option is enabled, the string is recoded with every
+ * usable language configuration and its words are checked with the speller
+ * of that configuration. The first non-English language whose share of
+ * accepted words exceeds RCC_REQUIRED_PROBABILITY wins; English is only
+ * reported after all other languages have been rejected.
+ *
+ * @param ctx       working context (must not be NULL)
+ * @param class_id  encoding class of buf
+ * @param buf       string to analyze
+ * @param len       length of buf as passed on to the recoding routines
+ * @param retstring if not NULL, receives the recoded string on success; the
+ *                  caller owns it. It is left untouched when -1 is returned.
+ * @return detected language id or (rcc_language_id)-1 if nothing is detected
+ */
+rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) {
+    rcc_speller speller;
+    unsigned long i, nlanguages;
+    rcc_language_config config, config0 = NULL;
+    rcc_string recoded;
+    unsigned char *utf8;
+    size_t j, mode;
+    unsigned long words, english, result;
+    unsigned char english_mode, english_word = 1;
+    rcc_language_id english_lang = (rcc_language_id)-1;
+    double res, english_res = 0;
+    rcc_option_value usedb4;
+
+
+    usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+
+    if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
+        recoded = rccDb4GetKey(ctx->db4ctx, buf, len);
+        if (recoded) {
+            if (rccStringFixID(recoded, ctx)) free(recoded);
+            else {
+                english_lang = rccStringGetLanguage(recoded);
+                if (retstring) *retstring = recoded;
+                else free(recoded);
+                return english_lang;
+            }
+        }
+    }
+
+    if (!rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) return (rcc_language_id)-1;
+
+    nlanguages = ctx->n_languages;
+
+    for (i=0;i<nlanguages;i++) {
+        config = rccGetUsableConfig(ctx, (rcc_language_id)i);
+        if (!config) continue;
+
+        /* Skip later languages that resolve to the same config as language 0. */
+        if (i) {
+            if (config==config0) continue;
+        } else config0=config;
+
+        speller = rccConfigGetSpeller(config);
+        if (rccSpellerGetError(speller)) continue;
+
+        recoded = rccConfigSizedFrom(config, class_id, buf, len);
+        if (!recoded) continue;
+
+        if (!strcasecmp(config->language->sn, rcc_english_language_sn)) english_mode = 1;
+        else english_mode = 0;
+
+        utf8 = (unsigned char*)rccStringGetString(recoded);
+        /* mode is 0 between words, otherwise (start index of the word + 1). */
+        for (result=0,english=0,words=0,mode=0,j=0;utf8[j];j++) {
+            if (isSpace(utf8[j])) {
+                if (mode) {
+                    if ((!english_mode)&&(english_word)) english++;
+                    result+=rccSpellerSized(speller, (const char*)(utf8 + mode - 1), j - mode + 1)?1:0;
+                    words++;
+                    mode = 0;
+                } else continue;
+            } else {
+                if (mode) {
+                    if (utf8[j]>0x7F) english_word = 0;
+                } else {
+                    mode = j + 1;
+                    english_word = 1;
+                }
+            }
+        }
+        if (mode) {
+            /* NOTE(review): the trailing word is never added to "english",
+               unlike words terminated by a separator - confirm the intent. */
+            result+=rccSpeller(speller, (const char*)(utf8 + mode - 1))?1:0;
+            words++;
+        }
+
+        if (english_mode) {
+            /* Avoid 0/0 for strings without any word. */
+            english_res = words?(1.*result/words):0;
+            english_lang = (rcc_language_id)i;
+        } else if (words) {
+            res = 1.*result/words;
+            if (res > RCC_REQUIRED_PROBABILITY) {
+                if (retstring) *retstring = recoded;
+                else free(recoded);
+                return (rcc_language_id)i;
+            }
+            /* Both counters are unsigned: without the second test a result
+               smaller than english wraps around and fakes a perfect match. */
+            if ((words > english)&&(result > english)) {
+                res = 1.*(result - english)/(words - english);
+                if (res > RCC_REQUIRED_PROBABILITY) {
+                    if (retstring) *retstring = recoded;
+                    else free(recoded);
+                    return (rcc_language_id)i;
+                }
+            }
+        }
+
+        free(recoded);
+    }
+
+    if (english_res > RCC_REQUIRED_PROBABILITY) {
+        if (retstring) {
+            *retstring = rccCreateString(english_lang, buf, len);
+        }
+        return english_lang;
+    }
+
+    return (rcc_language_id)-1;
+}
+/**
+ * Public entry point of the language detection. A NULL ctx selects the
+ * default context.
+ *
+ * @param ctx      working context or NULL for the default one
+ * @param class_id encoding class of buf
+ * @param buf      string to analyze
+ * @param len      length of buf as passed on to rccDetectLanguageInternal()
+ * @return the value of rccDetectLanguageInternal(), or -1 when no context is
+ *         available
+ */
+rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
+    if (!ctx) ctx = rcc_default_ctx;
+    if (!ctx) return -1;
+
+    return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL);
+}
-static rcc_autocharset_id rccIConvAuto(rcc_context ctx, rcc_class_id class_id, const char *buf, int len) {
+rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) {
+ rcc_context ctx;
rcc_class_type class_type;
rcc_engine_ptr engine;
- if (!buf) return (rcc_autocharset_id)-1;
+ if ((!buf)||(!config)) return (rcc_autocharset_id)-1;
+
+ ctx = config->ctx;
class_type = rccGetClassType(ctx, class_id);
if ((class_type != RCC_CLASS_FS)||((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_TITLES)))) {
- engine = rccGetCurrentEnginePointer(ctx);
+ engine = rccConfigGetCurrentEnginePointer(config);
if ((!engine)||(!engine->func)) return (rcc_autocharset_id)-1;
return engine->func(&ctx->engine_ctx, buf, len);
}
@@ -30,16 +153,26 @@ static rcc_autocharset_id rccIConvAuto(rcc_context ctx, rcc_class_id class_id, c
return (rcc_autocharset_id)-1;
}
+/**
+ * Detects the charset of a string using the current configuration of the
+ * context. A NULL ctx selects the default context.
+ *
+ * @param ctx      working context or NULL for the default one
+ * @param class_id encoding class of buf
+ * @param buf      string to analyze
+ * @param len      length of buf as passed on to rccConfigDetectCharset()
+ * @return the value of rccConfigDetectCharset() for ctx->current_config, or
+ *         -1 when no context is available
+ */
+int rccDetectCharset(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
+    if (!ctx) ctx = rcc_default_ctx;
+    if (!ctx) return -1;
+
+    return rccConfigDetectCharset(ctx->current_config, class_id, buf, len);
+}
+
+
rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
int err;
size_t ret;
- rcc_language_id language_id;
+ rcc_language_id language_id, detected_language_id;
rcc_autocharset_id charset_id;
rcc_iconv icnv = NULL;
rcc_string result;
rcc_option_value usedb4;
const char *charset;
-
+
if (!ctx) {
if (rcc_default_ctx) ctx = rcc_default_ctx;
else return NULL;
@@ -52,10 +185,11 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf,
language_id = rccGetCurrentLanguage(ctx);
if (language_id == (rcc_language_id)-1) return NULL;
- if (!strcasecmp(ctx->languages[language_id]->sn, "off")) return NULL;
+ if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL;
- usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+ usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+/*
if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
result = rccDb4GetKey(ctx->db4ctx, buf, len);
if (result) {
@@ -63,11 +197,22 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf,
else return result;
}
}
+
+ if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) {
+ detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len);
+ if (detected_language_id != (rcc_language_id)-1)
+ language_id = detected_language_id;
+ }
+*/
+
+ detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result);
+ if (detected_language_id != (rcc_language_id)-1) return result;
+
err = rccConfigure(ctx);
if (err) return NULL;
- charset_id = rccIConvAuto(ctx, class_id, buf, len);
+ charset_id = rccDetectCharset(ctx, class_id, buf, len);
if (charset_id != (rcc_autocharset_id)-1) {
icnv = ctx->iconv_auto[charset_id];
if (rccGetOption(ctx, RCC_OPTION_AUTOENGINE_SET_CURRENT)) {
@@ -105,6 +250,9 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
rcc_language_id language_id;
rcc_language_id current_language_id;
rcc_class_type class_type;
+ rcc_option_value translate;
+ const char *langname;
+ unsigned char english_source;
rcc_iconv icnv;
if (!ctx) {
@@ -127,33 +275,60 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
if (err) return NULL;
class_type = rccGetClassType(ctx, class_id);
- if ((class_type != RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))) {
+ translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE);
+
+ langname = rccGetLanguageName(ctx, language_id);
+ if (strcasecmp(langname, rcc_english_language_sn)) english_source = 0;
+ else english_source = 1;
+
+ if ((class_type != RCC_CLASS_FS)&&((translate==RCC_OPTION_TRANSLATE_FULL)||((translate)&&(!english_source)))) {
current_language_id = rccGetCurrentLanguage(ctx);
if (current_language_id != language_id) {
if ((config->trans)&&(config->translang != current_language_id)) {
rccTranslateClose(config->trans);
config->trans = NULL;
}
- if (!config->trans) {
- config->trans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rccGetLanguageName(ctx, current_language_id));
- config->translang = current_language_id;
+
+ if (translate != RCC_OPTION_TRANSLATE_TO_ENGLISH) {
+ if (!config->trans) {
+ config->trans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rccGetLanguageName(ctx, current_language_id));
+ config->translang = current_language_id;
+ }
+
+ if (config->trans) {
+ translated = rccTranslate(config->trans, utfstring);
+ if (translated) {
+ language_id = current_language_id;
+
+ config = rccGetConfig(ctx, language_id);
+ if (!config) {
+ free(translated);
+ return NULL;
+ }
+
+ err = rccConfigConfigure(config);
+ if (err) {
+ free(translated);
+ return NULL;
+ }
+ }
+ }
}
- if (config->trans) {
- translated = rccTranslate(config->trans, utfstring);
- if (translated) {
- language_id = current_language_id;
+
+ if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((config->trans)&&(!translated))) {
+ puts("entrans");
+ if (!config->entrans) {
+ config->entrans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rcc_english_language_sn);
+ }
+ if (config->entrans) {
+ translated = rccTranslate(config->entrans, utfstring);
+
config = rccGetConfig(ctx, language_id);
- if (!config) {
- free(translated);
- return NULL;
- }
+ if (!config) return translated;
err = rccConfigConfigure(config);
- if (err) {
- free(translated);
- return NULL;
- }
+ if (err) return translated;
}
}
}
@@ -183,7 +358,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
icnv = config->iconv_to[class_id];
if (icnv) {
- newlen = rccIConvInternal(ctx, icnv, translated?translated:utfstring, newlen);
+ newlen = rccIConvInternal(ctx, icnv, translated?translated:utfstring, translated?0:newlen);
if (translated) free(translated);
if (newlen == (size_t)-1) return NULL;
@@ -237,7 +412,7 @@ char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const
err = rccConfigure(ctx);
if (err) return NULL;
- from_charset_id = rccIConvAuto(ctx, from, buf, len);
+ from_charset_id = rccDetectCharset(ctx, from, buf, len);
if (from_charset_id != (rcc_charset_id)-1) {
from_charset = rccGetAutoCharsetName(ctx, from_charset_id);
to_charset = rccGetCurrentCharsetName(ctx, to);
@@ -385,13 +560,15 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char
return extracted;
}
-/* Convert to class_id from Charset */
+/* Convert to class_id from Charset.
+This function assumes that both the charset and the language of the incoming
+string are already known, so no language detection (DB4, Aspell) will be
+performed. */
char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen) {
size_t res;
rcc_iconv icnv;
rcc_string str;
char *extracted;
-
if (!charset) return NULL;
diff --git a/ui/rccnames.c b/ui/rccnames.c
index b6d08dd..0e4f586 100644
--- a/ui/rccnames.c
+++ b/ui/rccnames.c
@@ -32,6 +32,7 @@ rcc_name rcc_default_language_names_embeded[RCC_MAX_LANGUAGES+1] = {
rcc_option_value_name rcc_default_option_boolean_names[] = { "Off", "On", NULL };
rcc_option_value_name rcc_default_option_learning_names[] = { "Off", "On", "Relearn", "Learn", NULL };
rcc_option_value_name rcc_default_option_clo_names[] = { "All Languages", "Configured / AutoEngine", "Configured Only", NULL };
+rcc_option_value_name rcc_default_option_translate_names[] = { "Off", "Translate to English", "Skip English Translation", "Full", NULL };
rcc_option_name rcc_default_option_names[RCC_MAX_OPTIONS+1];
rcc_option_name rcc_default_option_names_embeded[RCC_MAX_OPTIONS+1] = {
@@ -39,8 +40,9 @@ rcc_option_name rcc_default_option_names_embeded[RCC_MAX_OPTIONS+1] = {
{ RCC_OPTION_AUTODETECT_FS_NAMES, "Autodetect File Names", rcc_default_option_boolean_names },
{ RCC_OPTION_AUTODETECT_FS_TITLES, "Autodetect FS Titles", rcc_default_option_boolean_names },
{ RCC_OPTION_CONFIGURED_LANGUAGES_ONLY, "Enabled Languages", rcc_default_option_clo_names },
- { RCC_OPTION_TRANSLATE, "Translate Text", rcc_default_option_boolean_names },
{ RCC_OPTION_AUTOENGINE_SET_CURRENT, "AutoEngine Set Current Encoding", rcc_default_option_boolean_names },
+ { RCC_OPTION_AUTODETECT_LANGUAGE, "Autodetect Language", rcc_default_option_boolean_names },
+ { RCC_OPTION_TRANSLATE, "Translate Text", rcc_default_option_translate_names },
{ RCC_MAX_OPTIONS }
};