summaryrefslogtreecommitdiff
path: root/mail/spamprobe
diff options
context:
space:
mode:
authorMatthew N. Dodd <mdodd@FreeBSD.org>2003-04-29 00:18:11 +0000
committerMatthew N. Dodd <mdodd@FreeBSD.org>2003-04-29 00:18:11 +0000
commitb9b33f4d05c03795553e399fa0a580f2123f33c1 (patch)
tree601c23545b77d93974686fcad79999bf55cc8954 /mail/spamprobe
parentUpdate to version 0.20. (diff)
Update to 0.8b
Diffstat (limited to 'mail/spamprobe')
-rw-r--r--mail/spamprobe/Makefile11
-rw-r--r--mail/spamprobe/distinfo2
-rw-r--r--mail/spamprobe/files/Makefile.export0_64
-rw-r--r--mail/spamprobe/files/patch-MessageFactory.cc34
-rw-r--r--mail/spamprobe/files/patch-md576
-rw-r--r--mail/spamprobe/files/spamprobe.1321
-rw-r--r--mail/spamprobe/pkg-message (renamed from mail/spamprobe/files/post-install-notes)0
7 files changed, 41 insertions, 407 deletions
diff --git a/mail/spamprobe/Makefile b/mail/spamprobe/Makefile
index 8b6b72c3dd91..8d5bba628453 100644
--- a/mail/spamprobe/Makefile
+++ b/mail/spamprobe/Makefile
@@ -6,16 +6,16 @@
#
PORTNAME= spamprobe
-PORTVERSION= 0.7c
+PORTVERSION= 0.8b
CATEGORIES= mail
MASTER_SITES= ${MASTER_SITE_SOURCEFORGE}
MASTER_SITE_SUBDIR=${PORTNAME}
-LIB_DEPENDS= db3.3:${PORTSDIR}/databases/db3
-
MAINTAINER= mdodd@freebsd.org
COMMENT= Spam detector using Bayesian analysis of word counts
+LIB_DEPENDS= db4:${PORTSDIR}/databases/db4
+
MAKEFILE= ${FILESDIR}/Makefile
MAKE_ENV+= FILESDIR="${FILESDIR}"
@@ -25,12 +25,9 @@ post-build:
@cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \
${FILESDIR}/Makefile.export0_6 clean all
-post-extract:
- @${RM} -rf ${WRKSRC}/thirdparty
-
post-install:
@cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \
${FILESDIR}/Makefile.export0_6 install
- @${CAT} ${FILESDIR}/post-install-notes
+ @${CAT} ${PKGMESSAGE}
.include <bsd.port.post.mk>
diff --git a/mail/spamprobe/distinfo b/mail/spamprobe/distinfo
index 92ea4bc7d06f..dd9880fe4eb2 100644
--- a/mail/spamprobe/distinfo
+++ b/mail/spamprobe/distinfo
@@ -1 +1 @@
-MD5 (spamprobe-0.7c.tar.gz) = 51e568a3bd908ca629537bb0f9acde8c
+MD5 (spamprobe-0.8b.tar.gz) = a5ddc25dd2d116f3e6f346b027ae034f
diff --git a/mail/spamprobe/files/Makefile.export0_6 b/mail/spamprobe/files/Makefile.export0_6
index bbd5cfd126d4..4b025b2aaf53 100644
--- a/mail/spamprobe/files/Makefile.export0_6
+++ b/mail/spamprobe/files/Makefile.export0_6
@@ -1,10 +1,10 @@
-# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.2 2002-10-08 23:48:39 mi Exp $
+# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.3 2003-04-29 00:18:11 mdodd Exp $
#
PREFIX?= /usr/local
BINDIR= ${PREFIX}/bin
NOMAN=
PROG_CXX= spamprobe-export_0.6
-CXXFLAGS+= -Wall -DUSE_DBM
+CXXFLAGS+= -Wall -DUSE_DBM -DNDEBUG
SRCS= File.cc export0_6.cc
.include <bsd.prog.mk>
diff --git a/mail/spamprobe/files/patch-MessageFactory.cc b/mail/spamprobe/files/patch-MessageFactory.cc
new file mode 100644
index 000000000000..941ab4596ac8
--- /dev/null
+++ b/mail/spamprobe/files/patch-MessageFactory.cc
@@ -0,0 +1,34 @@
+--- MessageFactory.cc.orig Tue Mar 11 07:38:41 2003
++++ MessageFactory.cc Tue Mar 11 07:51:38 2003
+@@ -28,7 +28,7 @@
+ // http://www.cooldevtools.com/qpl.html
+ //
+
+-#include <strstream>
++#include <sstream>
+ #include "Tokenizer.h"
+ #include "MessageFactory.h"
+ #include "RegularExpression.h"
+@@ -50,11 +50,11 @@
+ MessageFactory::MessageFactory()
+ : m_minWordLength(2),
+ m_maxWordLength(90),
++ m_phraser(new PhraseBuilder(2)),
+ m_replaceNonAsciiChars(true),
+ m_nonAsciiChar('z'),
+ m_removeHTML(true),
+- m_headersToInclude(NORMAL_HEADERS),
+- m_phraser(new PhraseBuilder(2))
++ m_headersToInclude(NORMAL_HEADERS)
+ {
+ }
+
+@@ -299,7 +299,7 @@
+ text += ' ';
+ } else if (entity[0] == '#') {
+ int code = 0;
+- istrstream in(entity.c_str() + 1);
++ istringstream in(entity.c_str() + 1);
+ in >> code;
+ text += (char)code;
+ } else {
diff --git a/mail/spamprobe/files/patch-md5 b/mail/spamprobe/files/patch-md5
deleted file mode 100644
index 8ae1e7847c8b..000000000000
--- a/mail/spamprobe/files/patch-md5
+++ /dev/null
@@ -1,76 +0,0 @@
---- MimeMessageReader.h Thu Sep 19 12:15:38 2002
-+++ MimeMessageReader.h Wed Sep 25 09:19:55 2002
-@@ -34,4 +34,7 @@
- #include "MimeHeader.h"
-+#include <sys/types.h>
-+#include <md5.h>
-+#define MD5_DIGEST_LENGTH 16
-
--class md5_state_s;
-+typedef unsigned char md5_digest_t[MD5_DIGEST_LENGTH*2 + 1];
-
-@@ -64,3 +65,3 @@
-
-- const string &getMD5Digest();
-+ const md5_digest_t &getMD5Digest();
-
-@@ -105,4 +106,4 @@
- vector<MimeHeader> m_headers;
-- string m_md5digest;
-- NewPtr<md5_state_s> m_md5state;
-+ md5_digest_t m_md5digest;
-+ NewPtr<MD5_CTX> m_md5state;
- };
---- MimeMessageReader.cc Thu Sep 19 12:15:38 2002
-+++ MimeMessageReader.cc Wed Sep 25 22:56:17 2002
-@@ -30,4 +30,5 @@
-
--#include <cstdio>
--#include "md5.h"
-+#include <sys/types.h>
-+#include <md5.h>
-+#define MD5_DIGEST_LENGTH 16
- #include "util.h"
-@@ -93,4 +92,4 @@
-
-- m_md5state.set(new md5_state_s);
-- md5_init(m_md5state.get());
-+ m_md5state.set(new MD5_CTX);
-+ MD5Init(m_md5state.get());
-
-@@ -140,3 +139,3 @@
- }
-- md5_append(m_md5state.get(), (md5_byte_t *)value.data(), value.length());
-+ MD5Update(m_md5state.get(), (const unsigned char *)value.data(), value.length());
- }
-@@ -228,3 +227,3 @@
-
--const string &MimeMessageReader::getMD5Digest()
-+const md5_digest_t &MimeMessageReader::getMD5Digest()
- {
-@@ -236,11 +235,10 @@
-
-- m_md5digest.erase();
--
-- md5_byte_t raw_digest[32];
-- char hexcode[8];
-- md5_finish(m_md5state.get(), raw_digest);
-- for (int i = 0; i < 16; ++i) {
-- sprintf(hexcode, "%02x", (unsigned)raw_digest[i]);
-- m_md5digest += hexcode;
-+ MD5Final(m_md5digest + MD5_DIGEST_LENGTH + 1, m_md5state.get());
-+ for (int i = 0; i < MD5_DIGEST_LENGTH; i++) {
-+ char hexdigits[] = "0123456789abcdef";
-+ m_md5digest[i*2] = hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] >> 4];
-+ m_md5digest[i*2 + 1] =
-+ hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] & 0x0f];
- }
-+ m_md5digest[MD5_DIGEST_LENGTH*2 + 1] = '\0';
- m_md5state.clear();
---- MessageFactory.cc Tue Sep 17 17:39:36 2002
-+++ MessageFactory.cc Tue Oct 8 18:59:07 2002
-@@ -127,3 +127,3 @@
-
-- msg.setDigest(reader.getMD5Digest());
-+ msg.setDigest((char *)reader.getMD5Digest());
-
diff --git a/mail/spamprobe/files/spamprobe.1 b/mail/spamprobe/files/spamprobe.1
deleted file mode 100644
index 18a1884d41d7..000000000000
--- a/mail/spamprobe/files/spamprobe.1
+++ /dev/null
@@ -1,321 +0,0 @@
-.\"
-.\" $Id$
-.\"
-.\" Note: The date here should be updated whenever a non-trivial
-.\" change is made to the manual page.
-.Dd September 5, 2002
-.Dt SPAMPROBE 1
-.Os
-.Sh NAME
-.Nm spamprobe
-.Nd "Spam detector using Bayesian analysis of word counts."
-.Sh SYNOPSIS
-.Nm
-.Op Fl a Ar char
-.Op Fl c
-.Op Fl d Ar directory
-.Op Fl h
-.Op Fl H Ar option
-.Op Fl m
-.Op Fl n Ar number
-.Op Fl r Ar number
-.Op Fl s Ar number
-.Op Fl v
-.Op Fl V
-.Op Fl Y
-.Op Fl 7
-.Op Fl 8
-.Ar command Op ...
-.Nm
-.Ar receive Op filename ...
-.Nm
-.Ar score Op filename ...
-.Nm
-.Ar find-spam Op filename ...
-.Nm
-.Ar find-good Op filename ...
-.Nm
-.Ar good Op filename ...
-.Nm
-.Ar spam Op filename ...
-.Nm
-.Ar remove Op filename ...
-.Nm
-.Ar dump
-.Nm
-.Ar export
-.Nm
-.Ar import Op filename ...
-.Sh DESCRIPTION
-Welcome to
-.Nm SpamProbe !
-Are you tired of the constant bombardment of your inbox by unwanted
-email pushing everything from porn to get rich quick schemes? Have you
-tried other spam filters but become disenchanted with them when you
-realized that their manually generated rule sets weren't updated fast
-enough to keep up with spammers' wording changes? Or that they generated
-unwanted false positive scores?
-.Pp
-.Nm SpamProbe
-operates on a different basis entirely. Instead of using pattern matching
-and a set of human generated rules
-.Nm SpamProbe
-relies on a Bayesian analysis
-of the frequency of words used in spam and non-spam emails received by an
-individual person. The process is completely automatic and tailors itself
-to the kinds of emails that each person receives.
-.Ss FEATURES
-.Bl -bullet -offset indent -compact
-.It
-Spam detection using Bayesian analysis of terms contained in each email.
-Words used often in spams but not in good email tend to indicate that a
-message is spam.
-.It
-Written in C++ for good performance. Database access using GDBM for quick
-startup and fast term count retrieval.
-.It
-Recognition and decoding of MIME attachments in quoted-printable and
-base64 encoding. Automatically skips non-text attachments.
-.It
-Counts two word phrases as well as single words for higher precision.
-.It
-Ignores HTML tags in emails for scoring purposes unless the -h command
-line option is used. Many spams use HTML and few humans do so HTML tends
-to become a powerful recognizer of spams. However in the author's opinion
-this also substantially increases the likelihood of false positives if
-someone does send a non-spam email containing HTML tags.
-.Nm SpamProbe
-does pull urls from inside of html tags however since those tend to be
-spammer specific.
-.It
-Locks mboxes and databases using fcntl file locking to avoid problems when
-multiple emails arrive simultaneously.
-.It
-Scores only the Received, Subject, To, From, and Cc headers. All other
-headers are ignored to make it hard for spammers to hide non-spammy words
-in X- headers to fool the filter. The
-.Fl H
-command line option can be used to override this.
-.El
-.Ss OPTIONS
-.Bl -tag -width ".Fl d Ar directory"
-.It Fl a Ar char
-By default
-.Nm
-converts non-ascii characters (characters with the most significant bit
-set to 1) into the letter 'z'. This is useful for lumping all Asian
-characters into a single word for easy recognition. The
-.Fl a
-option allows you to change the character to something else if you don't
-like the letter 'z' for some reason.
-.It Fl c
-Create the database directory if it does not already exist. Normally
-.Nm
-exits with a usage error if the database directory does not already exist.
-.It Fl d Ar directory
-By default
-.Nm
-stores its database in a directory named .spamprobe under your home
-directory. The
-.Fl d
-option allows you to specify a different directory to use. This is
-necessary if your home directory is NFS mounted for example.
-.It Fl h
-By default
-.Nm
-removes HTML markup from the text in emails to help avoid false positives.
-The
-.Fl h
-option allows you to override this behavior and force
-.Nm
-to include words from within HTML tags in its word counts. Note that
-.Nm
-always counts any URLs in hrefs within tags whether
-.Fl h
-is used or not. Use of this option is discouraged. It can increase the
-rate of spam detection slightly but unless the user receives a significant
-amount of HTML emails it also tends to increase the number of false
-positives.
-.It Fl H Ar option
-By default
-.Nm
-only scans a meaningful subset of headers from the email message when
-searching for words to score. The
-.Fl H
-option allows the user to specify additional headers to scan. Legal values
-are "all", "nox", or "normal". "all" scans all headers, "nox" scans all
-headers except those starting with X-, and "normal" scans the normal set
-of headers.
-.It Fl m
-Use mbox format for reading emails in receive mode. Normally
-.Nm
-assumes that the input to receive mode contains a single message so it
-doesn't look for message breaks.
-.It Fl n Ar number
-Changes the number of most significant words/phrases used by
-.Nm
-to calculate the score for each message. Generally this is changed only
-for optimization purposes.
-.It Fl r Ar number
-Changes the number of times that a single word/phrase can occur in the
-top words array used to calculate the score for each message. Allowing
-repeats reduces the number of words overall (since a single word occupies
-more than one slot) but allows words which occur frequently in the message
-to have a higher weight. Generally this is changed only for optimization
-purposes.
-.It Fl s Ar number
-.Nm
-maintains an in memory cache of the words it has seen in previous messages
-to reduce disk i/o and improve performance. By default the cache is
-flushed and cleared every 250 messages. This number can be changed using
-the
-.Fl s
-option. A value of zero causes
-.Nm
-to use 100,000 as the limit which effectively means that the cache will
-only be flushed at program exit (unless you have really enormous mailbox
-files). The cache doesn't affect receive, dump, or export but has a
-significant impact on the others.
-.It Fl v
-Write debugging information to stderr. This can be useful for debugging
-or for seeing which terms
-.Nm
-used to score each email.
-.It Fl V
-Prints version and copyright information and then exits.
-.It Fl Y
-Assume traditional Berkeley mailbox format, ignoring any Content-Length:
-fields.
-.It Fl 7
-Ignore any characters with the most significant bit set to 1 instead of
-mapping them to the letter 'z'.
-.It Fl 8
-Store all characters even if their most significant bit is set to 1.
-.El
-.Pp
-.Ss COMMANDS
-.Bl -tag -width ".Ar find-spam Op filename ..."
-.It Ar receive Op filename ...
-Tells
-.Nm
-to read its standard input (or a file specified after the receive command)
-and score it using the current databases. Once the message has been
-scored the message is classified as either spam or non-spam and its word
-counts are written to the appropriate database. The message's score is
-written to stdout along with a single word. For example:
-.Pp
-.Dl "SPAM 0.99"
-.Pp
-or
-.Pp
-.Dl "GOOD 0.02"
-.It Ar score Op filename ...
-Similar to receive except that the databases are not modified in any way
-and only the score is printed to stdout.
-.It Ar find-spam Op filename ...
-Similar to score except that it prints a short summary and score for each
-message that is determined to be spam. This can be useful when testing.
-.It Ar find-good Op filename ...
-Similar to score except that it prints a short summary and score for each
-message that is determined to be good. This can be useful when testing.
-.It Ar good Op filename ...
-Scans each file (or stdin if no file is specified) and reclassifies every
-email in the file as non-spam. The databases are updated appropriately.
-Previously processed messages (recognized using their message ids) are
-ignored.
-.It Ar spam Op filename ...
-Scans each file (or stdin if no file is specified) and reclassifies every
-email in the file as spam. The databases are updated appropriately.
-Previously processed messages (recognized using their message ids) are
-ignored.
-.It Ar remove Op filename ...
-Scans each file (or stdin if no file is specified) and removes its term
-counts from the database. Messages which are not in the database
-(recognized using their message ids) are ignored.
-.It Ar dump
-Prints the contents of the word counts database one word per line in human
-readable format with good count, spam count, and word in columns separated
-by whitespace. Note that when using GDBM for the database the words are
-printed in the order they are hashed so the results will need to be sorted
-to be most useful. The standard unix sort command can do this. For
-example to list all words from "most good" to "least good" use this
-command:
-.Pp
-.Dl "spamprobe dump | sort -k 1 -n -r"
-.Pp
-To list all words from "most spammy" to "least spammy" use this command:
-.Pp
-.Dl "spamprobe dump | sort -k 2 -n -r"
-.It Ar export
-Similar to the dump command but prints the counts and words in a comma
-separated format with the words surrounded by double quotes. This can be
-more useful for importing into some databases.
-.It Ar import Op filename ...
-Reads the specified files which must contain export data written by the
-export command. The terms and counts from this file are added to the
-database. This can be used to convert a database from a prior version.
-.El
-.Sh ENVIRONMENT
-The
-.Nm
-command looks for the database directory in the user's home directory
-specified by the
-.Ev HOME
-environment variable. Use the
-.Fl d
-flag to specify a different database directory.
-.Sh FILES
-.Bl -tag -width ".Pa $HOME/. Ns Nm" -compact
-.It Pa $HOME/. Ns Nm
-The default database directory.
-.El
-.Sh EXAMPLES
-Typically one would use
-.Nm
-with
-.Nm procmail
-and
-.Nm formail
-to flag and filter incoming email.
-.Pp
-.Dl "# SpamProbe rule."
-.Dl ":0"
-.Dl "{"
-.Dl " # Generate a score for the message."
-.Dl " SCORE=`spamprobe receive`"
-.Dl " # Add a X-SpamProbe header to the message."
-.Dl " :0 fhW"
-.Dl " | formail -I ""X-SpamProbe: $SCORE"""
-.Dl "}"
-.Pp
-.Dl "# Filter matching messages to their own mailbox."
-.Dl ":0:"
-.Dl "*^X-SpamProbe: SPAM"
-.Dl "spamprobe"
-.Sh DIAGNOSTICS
-Exit status is 0 on success, and 1 if
-.Nm
-encounters an invalid command.
-.Sh COMPATIBILITY
-Versions of
-.Nm
-previous to 0.7 use a different database format. To convert your existing
-database to the new format use the following command.
-.Pp
-.Dl "spamprobe-export_0.6 | spamprobe import"
-.Sh SEE ALSO
-.Xr formail 1 ,
-.Xr procmail 1 ,
-.Rs
-.%A "Paul Graham"
-.%T "A Plan for Spam"
-.%O http://www.paulgraham.com/spam.html
-.%D "August 2002"
-.Re
-.Sh AUTHORS
-This
-manual page was written by
-.An Matthew N. Dodd Aq mdodd@FreeBSD.org .
-.Nm
-was written by
-.An Brian Burton Aq bburton@users.sourceforge.net
diff --git a/mail/spamprobe/files/post-install-notes b/mail/spamprobe/pkg-message
index c115a23a4bc5..c115a23a4bc5 100644
--- a/mail/spamprobe/files/post-install-notes
+++ b/mail/spamprobe/pkg-message