summaryrefslogtreecommitdiff
path: root/textproc/libtextcat/files/patch-src_fingerprint.c
diff options
context:
space:
mode:
authorThierry Thomas <thierry@FreeBSD.org>2007-08-23 22:13:35 +0000
committerThierry Thomas <thierry@FreeBSD.org>2007-08-23 22:13:35 +0000
commit3a00ffa77e26933c3406b9a99a9347e47afa47ca (patch)
tree4d4227847615ed17c93dfea8ccfaee024231eafa /textproc/libtextcat/files/patch-src_fingerprint.c
parent- Update to 20070822 (diff)
Import patches (imported from OpenOffice.org 2.3 by Fedora, at least in
part). These patches, released under a BSD license, seem to improve the accuracy of language detection, especially those that don't have a Latin script.
Notes
Notes: svn path=/head/; revision=198222
Diffstat (limited to 'textproc/libtextcat/files/patch-src_fingerprint.c')
-rw-r--r--textproc/libtextcat/files/patch-src_fingerprint.c164
1 files changed, 164 insertions, 0 deletions
diff --git a/textproc/libtextcat/files/patch-src_fingerprint.c b/textproc/libtextcat/files/patch-src_fingerprint.c
new file mode 100644
index 000000000000..a01722c160b0
--- /dev/null
+++ b/textproc/libtextcat/files/patch-src_fingerprint.c
@@ -0,0 +1,164 @@
+--- src/fingerprint.c.orig Thu May 22 13:32:43 2003
++++ src/fingerprint.c Thu Aug 23 22:47:07 2007
+@@ -63,6 +63,10 @@
+ * - put table/heap datastructure in a separate file.
+ */
+
++#ifndef _UTF8_
++#define _UTF8_
++#endif
++
+ #include "config.h"
+ #include <stdio.h>
+ #ifdef HAVE_STDLIB_H
+@@ -80,10 +84,12 @@
+ #include "wg_mempool.h"
+ #include "constants.h"
+
++#include "utf8misc.h"
+
+ #define TABLESIZE (1<<TABLEPOW)
+ #define TABLEMASK ((TABLESIZE)-1)
+
++
+ typedef struct {
+
+ sint2 rank;
+@@ -134,29 +140,14 @@
+ }
+
+
+-/* checks if n-gram lex is a prefix of key and of length len */
+-inline int issame( char *lex, char *key, int len )
+-{
+- int i;
+- for (i=0; i<len; i++) {
+- if ( key[i] != lex[i] ) {
+- return 0;
+- }
+- }
+- if ( lex[i] != 0 ) {
+- return 0;
+- }
+- return 1;
+-}
+-
+
+ /* increases frequency of ngram(p,len) */
+-static inline int increasefreq( table_t *t, char *p, int len )
+-{
+- uint4 hash = simplehash( p, len ) & TABLEMASK;
++static int increasefreq( table_t *t, char *p, int len )
++{
++ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+-
+- while ( entry ) {
++
++ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ /*** Found it! ***/
+ entry->cnt++;
+@@ -168,7 +159,7 @@
+ }
+
+ /*** Not found, so create ***/
+- entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
++ entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
+ strcpy( entry->str, p );
+ entry->cnt = 1;
+
+@@ -181,12 +172,12 @@
+ #if 0
+
+ /* looks up ngram(p,len) */
+-static entry_t *findfreq( table_t *t, char *p, int len )
+-{
+- uint4 hash = simplehash( p, len ) & TABLEMASK;
++static entry_t *findfreq( table_t *t, char *p, int len )
++{
++ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+-
+- while ( entry ) {
++
++ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ return entry;
+ }
+@@ -219,7 +210,7 @@
+ #define GREATER(x,y) ((x).cnt > (y).cnt)
+ #define LESS(x,y) ((x).cnt < (y).cnt)
+
+-inline static void siftup( table_t *t, unsigned int child )
++static void siftup( table_t *t, unsigned int child )
+ {
+ entry_t *heap = t->heap;
+ unsigned int parent = (child-1) >> 1;
+@@ -241,7 +232,7 @@
+ }
+
+
+-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
++static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+ {
+ entry_t *heap = t->heap;
+ unsigned int child = parent*2 + 1;
+@@ -458,21 +449,27 @@
+ return dest;
+ }
+
+-
++/**
++* this function extract all n-gram from past buffer and put them into the table "t"
++* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice
++*/
+ static void createngramtable( table_t *t, const char *buf )
+ {
+ char n[MAXNGRAMSIZE+1];
+ const char *p = buf;
+ int i;
++ int pointer = 0;
+
+ /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+- for (;;p++) {
++ while(1) {
+
+- const char *q = p;
++ const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/
+ char *m = n;
+
+ /*** First char may be an underscore ***/
+- *m++ = *q++;
++ int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/
++ q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/
++ m += decay; /*[modified]*/
+ *m = '\0';
+
+ increasefreq( t, n, 1 );
+@@ -482,19 +479,22 @@
+ }
+
+ /*** Let the compiler unroll this ***/
+- for ( i=2; i<=MAXNGRAMSIZE; i++) {
++ for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
+
+- *m++ = *q;
++ decay = charcopy(q, m); /*[modified] like above*/
++ m += decay;
+ *m = '\0';
+
+ increasefreq( t, n, i );
+
+ if ( *q == '_' ) break;
+- q++;
++ q += decay;
+ if ( *q == '\0' ) {
+ return;
+ }
+ }
++
++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/
+ }
+ return;
+ }