summaryrefslogtreecommitdiffstats
path: root/src/librcd.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/librcd.c')
-rw-r--r--src/librcd.c263
1 files changed, 263 insertions, 0 deletions
diff --git a/src/librcd.c b/src/librcd.c
new file mode 100644
index 0000000..2a99378
--- /dev/null
+++ b/src/librcd.c
@@ -0,0 +1,263 @@
+#include <stdio.h>
+
+#include "librcd.h"
+
+#define NF_VALUE -2
+#define max(a,b) ((a>b)?a:b)
+#define min(a,b) ((a<b)?a:b)
+#define bit(i) (1<<i)
+#define STRNLEN(str,n) (n?strnlen(str,n):strlen(str))
+
+
+typedef struct lng_stat2 {
+ unsigned char a;
+ unsigned char b;
+ double rate;
+ double srate;
+ double erate;
+} lng_stat2;
+
+#include "russian_table.h"
+
+
+static int end_symbol(char ch) {
+ if (ch=='\r'||ch=='\n'||ch==0||ch==' '||ch=='\t'||ch==','||ch=='.'||ch=='!'||ch=='?'||ch==';'||ch=='-'||ch==':'||ch=='"'||ch=='\''||ch==')') return 1;
+ return 0;
+}
+
+static int start_symbol(char ch) {
+ if ((ch=='\t')||ch=='\r'||ch=='\n'||(ch==' ')||(ch=='(')||(ch=='"')||(ch=='\'')) return 1;
+ return 0;
+}
+
+typedef const struct lng_stat2 *lng_stat2_ptr;
+
+static void bfind(const unsigned char *a, lng_stat2_ptr *w, lng_stat2_ptr *k, lng_stat2_ptr *al) {
+ const struct lng_stat2 *winptr, *koiptr,*altptr;
+ int ki,wi,ai,d,ws=0,ks=0,as=0;
+ d=npow2>>1;
+ wi=d;
+ ki=d;
+ ai=d;
+ winptr=0;
+ koiptr=0;
+ altptr=0;
+ do{
+ d>>=1;
+
+ if(!ws){
+ if (wi>indexes2) wi-=d;
+ else {
+ winptr=enc_win+wi-1;
+ if(a[0]==winptr->a){
+ if(a[1]==winptr->b){
+ ws=1;
+ }else if(a[1]<winptr->b){
+ wi-=d;
+ }else{ //b>win[wi].b
+ wi+=d;
+ }
+ }else if(a[0]<winptr->a){
+ wi-=d;
+ }else{ //a>win[wi].a
+ wi+=d;
+ }
+ }
+ }
+ if(!ks){
+ if (ki>indexes2) ki-=d;
+ else {
+ koiptr=enc_koi+ki-1;
+ if(a[0]==koiptr->a){
+ if(a[1]==koiptr->b){
+ ks=1;
+ }else if(a[1]<koiptr->b){
+ ki-=d;
+ }else{ //b>win[wi].b
+ ki+=d;
+ }
+ }else if(a[0]<koiptr->a){
+ ki-=d;
+ }else{ //a>win[wi].a
+ ki+=d;
+ }
+ }
+ }
+ if(!as){
+ if (ai>indexes2) ai-=d;
+ else {
+ altptr=enc_alt+ai-1;
+ if(a[0]==altptr->a){
+ if(a[1]==altptr->b){
+ as=1;
+ }else if(a[1]<altptr->b){
+ ai-=d;
+ }else{ //b>win[wi].b
+ ai+=d;
+ }
+ }else if(a[0]<altptr->a){
+ ai-=d;
+ }else{ //a>win[wi].a
+ ai+=d;
+ }
+ }
+ }
+ }while(d);
+ if (ws) *w=winptr;
+ else *w=NULL;
+ if (ks) *k=koiptr;
+ else *k=NULL;
+ if (as) *al=altptr;
+ else *al=NULL;
+}
+
+static double calculate(double s, double m, double e) {
+ return s+m+e;
+}
+
+static int is_win_charset2(const unsigned char *txt, int len){
+ const struct lng_stat2 *winptr, *koiptr,*altptr;
+ double winstep,koistep,altstep,winestep,koiestep,altestep,winsstep,koisstep,altsstep;
+ double winstat=0,koistat=0,altstat=0,winestat=0,koiestat=0,altestat=0,winsstat=0,koisstat=0,altsstat=0;
+ long j;
+
+#ifdef _AUTO_DEBUG
+ fprintf(stderr,"Word: %s\n",txt);
+#endif
+ for(j=0;j<len-1;j++){
+ //skip bottom half of table
+ if(txt[j]<128 || txt[j+1]<128) continue;
+#ifdef _AUTO_DEBUG
+ fprintf(stderr,"Pair: %c%c",txt[j],txt[j+1]);
+#endif
+ bfind(txt+j,&winptr,&koiptr,&altptr);
+
+ if ((j==0)||(start_symbol(txt[j-1]))) {
+ if (winptr) winsstep=winptr->srate;
+ else winsstep=NF_VALUE;
+ if (koiptr) koisstep=koiptr->srate;
+ else koisstep=NF_VALUE;
+ if (altptr) altsstep=altptr->srate;
+ else altsstep=NF_VALUE;
+ winestep=0;
+ koiestep=0;
+ altestep=0;
+ winstep=0;
+ koistep=0;
+ altstep=0;
+#ifdef _AUTO_DEBUG
+ fprintf(stderr,", Win %lf, Koi %lf, Alt: %lf\n",winsstep,koisstep,altsstep);
+#endif
+ } else if ((j==len-2)||(end_symbol(txt[j+2]))) {
+ if (winptr) winestep=winptr->erate;
+ else winestep=NF_VALUE;
+ if (koiptr) koiestep=koiptr->erate;
+ else koiestep=NF_VALUE;
+ if (altptr) altestep=altptr->erate;
+ else altestep=NF_VALUE;
+ winsstep=0;
+ koisstep=0;
+ altsstep=0;
+ winstep=0;
+ koistep=0;
+ altstep=0;
+#ifdef _AUTO_DEBUG
+ fprintf(stderr,", Win %lf, Koi %lf, Alt %lf\n",winestep,koiestep,altestep);
+#endif
+ } else {
+ if (winptr) winstep=winptr->rate;
+ else winstep=NF_VALUE;
+ if (koiptr) koistep=koiptr->rate;
+ else koistep=NF_VALUE;
+ if (altptr) altstep=altptr->rate;
+ else altstep=NF_VALUE;
+ winsstep=0;
+ winestep=0;
+ koisstep=0;
+ koiestep=0;
+ altsstep=0;
+ altestep=0;
+#ifdef _AUTO_DEBUG
+ fprintf(stderr,", Win %lf, Koi %lf, Alt %lf\n",winstep,koistep,altstep);
+#endif
+ }
+
+ winstat+=winstep;
+ koistat+=koistep;
+ altstat+=altstep;
+ winsstat+=winsstep;
+ koisstat+=koisstep;
+ altsstat+=altsstep;
+ winestat+=winestep;
+ koiestat+=koiestep;
+ altestat+=altestep;
+ }
+
+#ifdef _AUTO_DEBUG
+ fprintf(stderr,"Start. Win: %lf, Koi: %lf, Alt: %lf\n",winsstat,koisstat,altsstat);
+ fprintf(stderr,"Middle. Win: %lf, Koi: %lf, Alt: %lf\n",winstat,koistat,altstat);
+ fprintf(stderr,"End. Win: %lf, Koi: %lf, Alt: %lf\n",winestat,koiestat,altestat);
+ fprintf(stderr,"Final. Win: %lf, Koi: %lf, Alt: %lf\n",calculate(winsstat,winstat,winestat),calculate(koisstat,koistat,koiestat),calculate(altsstat,altstat,altestat));
+#endif
+ if ((calculate(altsstat,altstat,altestat)>calculate(koisstat,koistat,koiestat))&&(calculate(altsstat,altstat,altestat)>calculate(winsstat,winstat,winestat))) return 3;
+ if (calculate(koisstat,koistat,koiestat)>calculate(winsstat,winstat,winestat)) return 1;
+ return 0;
+}
+
+
+static int check_utf8(const unsigned char *buf, int len) {
+ long i,j;
+ int bytes=0,rflag=0;
+ unsigned char tmp;
+ int res=0;
+
+ for (i=0;i<len;i++) {
+ if (buf[i]<128) continue;
+
+ if (bytes>0) {
+ if ((buf[i]&0xC0)==0x80) {
+ if (rflag) {
+ tmp=buf[i]&0x3F;
+ // Russian is 0x410-0x44F
+ if ((rflag==1)&&(tmp>=0x10)) res++;
+ else if ((rflag==2)&&(tmp<=0x0F)) res++;
+ }
+ bytes--;
+ } else {
+ res--;
+ bytes=1-bytes;
+ rflag=0;
+ }
+ } else {
+ for (j=6;j>=0;j--)
+ if ((buf[i]&bit(j))==0) break;
+
+ if ((j==0)||(j==6)) {
+ if ((j==6)&&(bytes<0)) bytes++;
+ else res--;
+ continue;
+ }
+ bytes=6-j;
+ if (bytes==1) {
+ // Cyrrilic D0-D3, Russian - D0-D1
+ if (buf[i]==0xD0) rflag=1;
+ else if (buf[i]==0xD1) rflag=2;
+ }
+ }
+
+ if ((buf[i]==0xD0)||(buf[i]==0xD1)) {
+ if (i+1==len) break;
+
+ }
+ }
+ return res;
+}
+
+
+enum russian_charsets get_russian_charset(const char *buf,int len) {
+ long l;
+
+ l = STRNLEN(buf,len);
+ if (check_utf8(buf,l)>1) return RUSSIAN_CHARSET_UTF8;
+ return is_win_charset2(buf,l);
+}