Import patches (imported from OpenOffice.org 2.3 by Fedora, at least in

part). These patches, released under a BSD license, seem to improve the accuracy of language detection, especially those that don't have a Latin script.
author: Thierry Thomas <thierry@FreeBSD.org> 2007-08-23 22:13:35 +0000
committer: Thierry Thomas <thierry@FreeBSD.org> 2007-08-23 22:13:35 +0000
commit: 3a00ffa77e26933c3406b9a99a9347e47afa47ca (patch)
tree: 4d4227847615ed17c93dfea8ccfaee024231eafa /textproc/libtextcat/files/patch-src_fingerprint.c
parent: - Update to 20070822 (diff)
1 files changed, 164 insertions, 0 deletions
diff --git a/textproc/libtextcat/files/patch-src_fingerprint.c b/textproc/libtextcat/files/patch-src_fingerprint.c
new file mode 100644
index 000000000000..a01722c160b0
--- /dev/null
+++ b/textproc/libtextcat/files/patch-src_fingerprint.c
@@ -0,0 +1,164 @@
+--- src/fingerprint.c.orig	Thu May 22 13:32:43 2003
++++ src/fingerprint.c	Thu Aug 23 22:47:07 2007
+@@ -63,6 +63,10 @@
+  * - put table/heap datastructure in a separate file.
+  */
+ 
++#ifndef _UTF8_
++#define _UTF8_
++#endif
++
+ #include "config.h"
+ #include <stdio.h>
+ #ifdef HAVE_STDLIB_H
+@@ -80,10 +84,12 @@
+ #include "wg_mempool.h"
+ #include "constants.h"
+ 
++#include "utf8misc.h"
+ 
+ #define TABLESIZE  (1<<TABLEPOW)
+ #define TABLEMASK  ((TABLESIZE)-1)
+ 
++
+ typedef struct {
+ 
+ 	sint2 rank;
+@@ -134,29 +140,14 @@
+ }
+ 
+ 
+-/* checks if n-gram lex is a prefix of key and of length len */
+-inline int issame( char *lex, char *key, int len )
+-{
+-	int i;
+-	for (i=0; i<len; i++) {
+-		if ( key[i] != lex[i] ) {
+-			return 0;
+-		}
+-	}
+-	if ( lex[i] != 0 ) {
+-		return 0;
+-	}
+-	return 1;
+-}
+-
+ 
+ /* increases frequency of ngram(p,len) */
+-static inline int increasefreq( table_t *t, char *p, int len ) 
+-{	
+-	uint4 hash = simplehash( p, len ) & TABLEMASK;				
++static int increasefreq( table_t *t, char *p, int len )
++{
++	uint4 hash = simplehash( p, len ) & TABLEMASK;
+ 	entry_t *entry = t->table[ hash ];
+-	
+-	while ( entry ) {				
++
++	while ( entry ) {
+ 		if ( issame( entry->str, p, len ) ) {
+ 			/*** Found it! ***/
+ 			entry->cnt++;
+@@ -168,7 +159,7 @@
+ 	}
+ 
+ 	/*** Not found, so create ***/
+-	entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
++        entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
+ 	strcpy( entry->str, p );
+ 	entry->cnt = 1;
+ 
+@@ -181,12 +172,12 @@
+ #if 0
+ 
+ /* looks up ngram(p,len) */
+-static entry_t *findfreq( table_t *t, char *p, int len ) 
+-{	
+-	uint4 hash = simplehash( p, len ) & TABLEMASK;				
++static entry_t *findfreq( table_t *t, char *p, int len )
++{
++	uint4 hash = simplehash( p, len ) & TABLEMASK;
+ 	entry_t *entry = t->table[ hash ];
+-	
+-	while ( entry ) {				
++
++	while ( entry ) {
+ 		if ( issame( entry->str, p, len ) ) {
+ 			return entry;
+ 		}
+@@ -219,7 +210,7 @@
+ #define GREATER(x,y) ((x).cnt > (y).cnt)
+ #define LESS(x,y)    ((x).cnt < (y).cnt)
+ 
+-inline static void siftup( table_t *t, unsigned int child )
++static void siftup( table_t *t, unsigned int child )
+ {
+ 	entry_t *heap = t->heap;
+ 	unsigned int parent = (child-1) >> 1;
+@@ -241,7 +232,7 @@
+ }
+ 
+ 
+-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
++static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+ {
+ 	entry_t *heap = t->heap;
+ 	unsigned int child = parent*2 + 1;
+@@ -458,21 +449,27 @@
+ 	return dest;
+ }
+ 
+-
++/**
++* this function extract all n-gram from past buffer and put them into the table "t"
++* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice
++*/
+ static void createngramtable( table_t *t, const char *buf )
+ {
+ 	char n[MAXNGRAMSIZE+1];
+ 	const char *p = buf;
+ 	int i;
++        int pointer = 0;
+ 
+ 	/*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+-	for (;;p++) {
++	while(1) {
+ 
+-		const char *q = p;
++     const char *q = &p[pointer];   /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/
+ 		char *m = n;
+ 
+ 		/*** First char may be an underscore ***/
+-		*m++ = *q++;
++                int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/
++                q = &(p[pointer+decay]);    /*[modified] the old copying method do not manage multi-character symbols*/
++                m += decay; /*[modified]*/
+ 		*m = '\0';
+ 
+ 		increasefreq( t, n, 1 );
+@@ -482,19 +479,22 @@
+ 		}
+ 
+ 		/*** Let the compiler unroll this ***/
+-		for ( i=2; i<=MAXNGRAMSIZE; i++) {
++		for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
+ 
+-			*m++ = *q;
++                        decay = charcopy(q, m); /*[modified] like above*/
++                        m += decay;
+ 			*m = '\0';
+ 
+ 			increasefreq( t, n, i );
+ 
+ 			if ( *q == '_' ) break;
+-			q++;
++                        q += decay;
+ 			if ( *q == '\0' ) {
+ 				return;
+ 			}
+ 		}
++
++  pointer = nextcharstart(p,pointer);   /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/
+ 	}
+ 	return;
+ }
author	Thierry Thomas <thierry@FreeBSD.org>	2007-08-23 22:13:35 +0000
committer	Thierry Thomas <thierry@FreeBSD.org>	2007-08-23 22:13:35 +0000
commit	3a00ffa77e26933c3406b9a99a9347e47afa47ca (patch)
tree	4d4227847615ed17c93dfea8ccfaee024231eafa /textproc/libtextcat/files/patch-src_fingerprint.c
parent	- Update to 20070822 (diff)