diff options
author | Matthew N. Dodd <mdodd@FreeBSD.org> | 2003-04-29 00:18:11 +0000 |
---|---|---|
committer | Matthew N. Dodd <mdodd@FreeBSD.org> | 2003-04-29 00:18:11 +0000 |
commit | b9b33f4d05c03795553e399fa0a580f2123f33c1 (patch) | |
tree | 601c23545b77d93974686fcad79999bf55cc8954 /mail/spamprobe | |
parent | Update to version 0.20. (diff) |
Update to 0.8b
Notes:
svn path=/head/; revision=79817
Diffstat (limited to 'mail/spamprobe')
-rw-r--r-- | mail/spamprobe/Makefile | 11 | ||||
-rw-r--r-- | mail/spamprobe/distinfo | 2 | ||||
-rw-r--r-- | mail/spamprobe/files/Makefile.export0_6 | 4 | ||||
-rw-r--r-- | mail/spamprobe/files/patch-MessageFactory.cc | 34 | ||||
-rw-r--r-- | mail/spamprobe/files/patch-md5 | 76 | ||||
-rw-r--r-- | mail/spamprobe/files/spamprobe.1 | 321 | ||||
-rw-r--r-- | mail/spamprobe/pkg-message (renamed from mail/spamprobe/files/post-install-notes) | 0 |
7 files changed, 41 insertions, 407 deletions
diff --git a/mail/spamprobe/Makefile b/mail/spamprobe/Makefile index 8b6b72c3dd91..8d5bba628453 100644 --- a/mail/spamprobe/Makefile +++ b/mail/spamprobe/Makefile @@ -6,16 +6,16 @@ # PORTNAME= spamprobe -PORTVERSION= 0.7c +PORTVERSION= 0.8b CATEGORIES= mail MASTER_SITES= ${MASTER_SITE_SOURCEFORGE} MASTER_SITE_SUBDIR=${PORTNAME} -LIB_DEPENDS= db3.3:${PORTSDIR}/databases/db3 - MAINTAINER= mdodd@freebsd.org COMMENT= Spam detector using Bayesian analysis of word counts +LIB_DEPENDS= db4:${PORTSDIR}/databases/db4 + MAKEFILE= ${FILESDIR}/Makefile MAKE_ENV+= FILESDIR="${FILESDIR}" @@ -25,12 +25,9 @@ post-build: @cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \ ${FILESDIR}/Makefile.export0_6 clean all -post-extract: - @${RM} -rf ${WRKSRC}/thirdparty - post-install: @cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \ ${FILESDIR}/Makefile.export0_6 install - @${CAT} ${FILESDIR}/post-install-notes + @${CAT} ${PKGMESSAGE} .include <bsd.port.post.mk> diff --git a/mail/spamprobe/distinfo b/mail/spamprobe/distinfo index 92ea4bc7d06f..dd9880fe4eb2 100644 --- a/mail/spamprobe/distinfo +++ b/mail/spamprobe/distinfo @@ -1 +1 @@ -MD5 (spamprobe-0.7c.tar.gz) = 51e568a3bd908ca629537bb0f9acde8c +MD5 (spamprobe-0.8b.tar.gz) = a5ddc25dd2d116f3e6f346b027ae034f diff --git a/mail/spamprobe/files/Makefile.export0_6 b/mail/spamprobe/files/Makefile.export0_6 index bbd5cfd126d4..4b025b2aaf53 100644 --- a/mail/spamprobe/files/Makefile.export0_6 +++ b/mail/spamprobe/files/Makefile.export0_6 @@ -1,10 +1,10 @@ -# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.2 2002-10-08 23:48:39 mi Exp $ +# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.3 2003-04-29 00:18:11 mdodd Exp $ # PREFIX?= /usr/local BINDIR= ${PREFIX}/bin NOMAN= PROG_CXX= spamprobe-export_0.6 -CXXFLAGS+= -Wall -DUSE_DBM +CXXFLAGS+= -Wall -DUSE_DBM -DNDEBUG SRCS= File.cc export0_6.cc .include <bsd.prog.mk> diff --git a/mail/spamprobe/files/patch-MessageFactory.cc 
b/mail/spamprobe/files/patch-MessageFactory.cc new file mode 100644 index 000000000000..941ab4596ac8 --- /dev/null +++ b/mail/spamprobe/files/patch-MessageFactory.cc @@ -0,0 +1,34 @@ +--- MessageFactory.cc.orig Tue Mar 11 07:38:41 2003 ++++ MessageFactory.cc Tue Mar 11 07:51:38 2003 +@@ -28,7 +28,7 @@ + // http://www.cooldevtools.com/qpl.html + // + +-#include <strstream> ++#include <sstream> + #include "Tokenizer.h" + #include "MessageFactory.h" + #include "RegularExpression.h" +@@ -50,11 +50,11 @@ + MessageFactory::MessageFactory() + : m_minWordLength(2), + m_maxWordLength(90), ++ m_phraser(new PhraseBuilder(2)), + m_replaceNonAsciiChars(true), + m_nonAsciiChar('z'), + m_removeHTML(true), +- m_headersToInclude(NORMAL_HEADERS), +- m_phraser(new PhraseBuilder(2)) ++ m_headersToInclude(NORMAL_HEADERS) + { + } + +@@ -299,7 +299,7 @@ + text += ' '; + } else if (entity[0] == '#') { + int code = 0; +- istrstream in(entity.c_str() + 1); ++ istringstream in(entity.c_str() + 1); + in >> code; + text += (char)code; + } else { diff --git a/mail/spamprobe/files/patch-md5 b/mail/spamprobe/files/patch-md5 deleted file mode 100644 index 8ae1e7847c8b..000000000000 --- a/mail/spamprobe/files/patch-md5 +++ /dev/null @@ -1,76 +0,0 @@ ---- MimeMessageReader.h Thu Sep 19 12:15:38 2002 -+++ MimeMessageReader.h Wed Sep 25 09:19:55 2002 -@@ -34,4 +34,7 @@ - #include "MimeHeader.h" -+#include <sys/types.h> -+#include <md5.h> -+#define MD5_DIGEST_LENGTH 16 - --class md5_state_s; -+typedef unsigned char md5_digest_t[MD5_DIGEST_LENGTH*2 + 1]; - -@@ -64,3 +65,3 @@ - -- const string &getMD5Digest(); -+ const md5_digest_t &getMD5Digest(); - -@@ -105,4 +106,4 @@ - vector<MimeHeader> m_headers; -- string m_md5digest; -- NewPtr<md5_state_s> m_md5state; -+ md5_digest_t m_md5digest; -+ NewPtr<MD5_CTX> m_md5state; - }; ---- MimeMessageReader.cc Thu Sep 19 12:15:38 2002 -+++ MimeMessageReader.cc Wed Sep 25 22:56:17 2002 -@@ -30,4 +30,5 @@ - --#include <cstdio> --#include "md5.h" -+#include 
<sys/types.h> -+#include <md5.h> -+#define MD5_DIGEST_LENGTH 16 - #include "util.h" -@@ -93,4 +92,4 @@ - -- m_md5state.set(new md5_state_s); -- md5_init(m_md5state.get()); -+ m_md5state.set(new MD5_CTX); -+ MD5Init(m_md5state.get()); - -@@ -140,3 +139,3 @@ - } -- md5_append(m_md5state.get(), (md5_byte_t *)value.data(), value.length()); -+ MD5Update(m_md5state.get(), (const unsigned char *)value.data(), value.length()); - } -@@ -228,3 +227,3 @@ - --const string &MimeMessageReader::getMD5Digest() -+const md5_digest_t &MimeMessageReader::getMD5Digest() - { -@@ -236,11 +235,10 @@ - -- m_md5digest.erase(); -- -- md5_byte_t raw_digest[32]; -- char hexcode[8]; -- md5_finish(m_md5state.get(), raw_digest); -- for (int i = 0; i < 16; ++i) { -- sprintf(hexcode, "%02x", (unsigned)raw_digest[i]); -- m_md5digest += hexcode; -+ MD5Final(m_md5digest + MD5_DIGEST_LENGTH + 1, m_md5state.get()); -+ for (int i = 0; i < MD5_DIGEST_LENGTH; i++) { -+ char hexdigits[] = "0123456789abcdef"; -+ m_md5digest[i*2] = hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] >> 4]; -+ m_md5digest[i*2 + 1] = -+ hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] & 0x0f]; - } -+ m_md5digest[MD5_DIGEST_LENGTH*2 + 1] = '\0'; - m_md5state.clear(); ---- MessageFactory.cc Tue Sep 17 17:39:36 2002 -+++ MessageFactory.cc Tue Oct 8 18:59:07 2002 -@@ -127,3 +127,3 @@ - -- msg.setDigest(reader.getMD5Digest()); -+ msg.setDigest((char *)reader.getMD5Digest()); - diff --git a/mail/spamprobe/files/spamprobe.1 b/mail/spamprobe/files/spamprobe.1 deleted file mode 100644 index 18a1884d41d7..000000000000 --- a/mail/spamprobe/files/spamprobe.1 +++ /dev/null @@ -1,321 +0,0 @@ -.\" -.\" $Id$ -.\" -.\" Note: The date here should be updated whenever a non-trivial -.\" change is made to the manual page. -.Dd September 5, 2002 -.Dt SPAMPROBE 1 -.Os -.Sh NAME -.Nm spamprobe -.Nd "Spam detector using Bayesian analysis of word counts." 
-.Sh SYNOPSIS -.Nm -.Op Fl a Ar char -.Op Fl c -.Op Fl d Ar directory -.Op Fl h -.Op Fl H Ar option -.Op Fl m -.Op Fl n Ar number -.Op Fl r Ar number -.Op Fl s Ar number -.Op Fl v -.Op Fl V -.Op Fl Y -.Op Fl 7 -.Op Fl 8 -.Ar command Op ... -.Nm -.Ar receive Op filename ... -.Nm -.Ar score Op filename ... -.Nm -.Ar find-spam Op filename ... -.Nm -.Ar find-good Op filename ... -.Nm -.Ar good Op filename ... -.Nm -.Ar spam Op filename ... -.Nm -.Ar remove Op filename ... -.Nm -.Ar dump -.Nm -.Ar export -.Nm -.Ar import Op filename ... -.Sh DESCRIPTION -Welcome to -.Nm SpamProbe ! -Are you tired of the constant bombardment of your inbox by unwanted -email pushing everything from porn to get rich quick schemes? Have you -tried other spam filters but become disenchanted with them when you -realized that their manually generated rule sets weren't updated fast -enough to keep up with spammers wording changes? Or that they generated -unwanted false positive scores? -.Pp -.Nm SpamProbe -operates on a different basis entirely. Instead of using pattern matching -and a set of human generated rules -.Nm SpamProbe -relies on a Bayesian analysis -of the frequency of words used in spam and non-spam emails received by an -individual person. The process is completely automatic and tailors itself -to the kinds of emails that each person receives. -.Ss FEATURES -.Bl -bullet -offset indent -compact -.It -Spam detection using Bayesian analysis of terms contained in each email. -Words used often in spams but not in good email tend to indicate that a -message is spam. -.It -Written in C++ for good performance. Database access using GDBM for quick -startup and fast term count retrieval. -.It -Recognition and decoding of MIME attachments in quoted-printable and -base64 encoding. Automatically skips non-text attachments. -.It -Counts two word phrases as well as single words for higher precision. -.It -Ignores HTML tags in emails for scoring purposes unless the -h command -line option is used. 
Many spams use HTML and few humans do so HTML tends -to become a powerful recognizer of spams. However in the author's opinion -this also substantially increases the likelihood of false positives if -someone does send a non-spam email containing HTML tags. -.Nm SpamProbe -does pull urls from inside of html tags however since those tend to be -spammer specific. -.It -Locks mboxes and databases using fcntl file locking to avoid problems when -multiple emails arrive simultaneously. -.It -Scores only the Received, Subject, To, From, and Cc headers. All other -headers are ignored to make it hard for spammers to hide non-spammy words -in X- headers to fool the filter. The -.Fl H -command line option can be used to override this. -.El -.Ss OPTIONS -.Bl -tag -width ".Fl d Ar directory" -.It Fl a Ar char -By default -.Nm -converts non-ascii characters (characters with the most significant bit -set to 1) into the letter 'z'. This is useful for lumping all Asian -characters into a single word for easy recognition. The -.Fl a -option allows you to change the character to something else if you don't -like the letter 'z' for some reason. -.It Fl c -Create the database directory if it does not already exist. Normally -.Nm -exits with a usage error if the database directory does not already exist. -.It Fl d Ar directory -By default -.Nm -stores its database in a directory named .spamprobe under your home -directory. The -.Fl d -option allows you to specify a different directory to use. This is -necessary if your home directory is NFS mounted for example. -.It Fl h -By default -.Nm -removes HTML markup from the text in emails to help avoid false positives. -The -.Fl h -option allows you to override this behavior and force -.Nm -to include words from within HTML tags in its word counts. Note that -.Nm -always counts any URLs in hrefs within tags whether -.Fl h -is used or not. Use of this option is discouraged. 
It can increase the -rate of spam detection slightly but unless the user receives a significant -amount of HTML emails it also tends to increase the number of false -positives. -.It Fl H Ar option -By default -.Nm -only scans a meaningful subset of headers from the email message when -searching for words to score. The -.Fl H -option allows the user to specify additional headers to scan. Legal values -are "all", "nox", or "normal". "all" scans all headers, "nox" scans all -headers except those starting with X-, and "normal" scans the normal set -of headers. -.It Fl m -Use mbox format for reading emails in receive mode. Normally -.Nm -assumes that the input to receive mode contains a single message so it -doesn't look for message breaks. -.It Fl n Ar number -Changes the number of most significant words/phrases used by -.Nm -to calculate the score for each message. Generally this is changed only -for optimization purposes. -.It Fl r Ar number -Changes the number of times that a single word/phrase can occurr in the -top words array used to calculate the score for each message. Allowing -repeats reduces the number of words overall (since a single word occupies -more than one slot) but allows words which occur frequently in the message -to have a higher weight. Generally this is changed only for optimization -purposes. -.It Fl s Ar number -.Nm -maintains an in memory cache of the words it has seen in previous messages -to reduce disk i/o and improve performance. By default the cache is -flushed and cleared every 250 messages. This number can be changed using -the -.Fl s -option. A value of zero causes -.NM -to use 100,000 as the limit which effectively means that the cache will -only be flushed at program exit (unless you have really enormous mailbox -files). The cache doesn't affect receive, dump, or export but has a -significant impact on the others. -.It Fl v -Write debugging information to stderr. 
This can be useful for debugging -or for seeing which terms -.Nm -used to score each email. -.It Fl V -Prints version and copyright information and then exits. -.It Fl Y -Assume traditional Berkeley mailbox format, ignoring any Content-Length: -fields. -.It Fl 7 -Ignore any characters with the most significant bit set to 1 instead of -mapping them to the letter 'z'. -.It Fl 8 -Store all characters even if their most significant bit is set to 1. -.El -.Pp -.Ss COMMANDS -.Bl -tag -width ".Ar find-spam Op filename ..." -.It Ar receive Op filename ... -Tells -.Nm -to read its standard input (or a file specified after the receive command) -and score it using the current databases. Once the message has been -scored the message is classified as either spam or non-spam and its word -counts are written to the appropriate database. The message's score is -written to stdout along with a single word. For example: -.Pp -.Dl "SPAM 0.99" -.Pp -or -.Pp -.Dl "GOOD 0.02" -.It Ar score Op filename ... -Similar to receive except that the databases are not modified in any way -and only the score is printed to stdout. -.It Ar find-spam Op filename ... -Similar to score except that it prints a short summary and score for each -message that is determined to be spam. This can be useful when testing. -.It Ar find-good Op filename ... -Similar to score except that it prints a short summary and score for each -message that is determined to be good. This can be useful when testing. -.It Ar good Op filename ... -Scans each file (or stdin if no file is specified) and reclassifies every -email in the file as non-spam. The databases are updated appropriately. -Previously processed messages (recognized using their message ids) are -ignored. -.It Ar spam Op filename ... -Scans each file (or stdin if no file is specified) and reclassifies every -email in the file as spam. The databases are updated appropriately. -Previously processed messages (recognized using their message ids) are -ignored. 
-.It Ar remove Op filename ... -Scans each file (or stdin if no file is specified) and removes its term -counts from the database. Messages which are not in the database -(recognized using their message ids) are ignored. -.It Ar dump -Prints the contents of the word counts database one word per line in human -readable format with good count, spam count, and word in columns separated -by whitespace. Note that when using GDBM for the database the words are -printed in the order they are hashed so the results will need to be sorted -to be most useful. The standard unix sort command can do this. For -example to list all words from "most good" to "least good" use this -command: -.Pp -.Dl "spamprobe dump | sort -k 1 -n -r" -.Pp -To list all words from "most spammy" to "least spammy" use this command: -.Pp -.Dl "spamprobe dump | sort -k 2 -n -r" -.It Ar export -Similar to the dump command but prints the counts and words in a comma -separated format with the words surrounded by double quotes. This can be -more useful for importing into some databases. -.It Ar import Op filename ... -Reads the specified files which must contain export data written by the -export command. The terms and counts from this file are added to the -database. This can be used to convert a database from a prior version. -.El -.Sh ENVIRONMENT -The -.Nm -command looks for the database directory in the users home directory -specified by the -.Ev HOME -environment variable. Use the -.Fl d -flag to specify a different database directory. -.Sh FILES -.Bl -tag -width ".Pa $HOME/. Ns Nm" -compact -.It Pa $HOME/. Ns Nm -The default database directory. -.El -.Sh EXAMPLES -Typically one would use -.Nm -with -.Nm procmail -and -.Nm formail -to flag and filter incoming email. -.Pp -.Dl "# SpamProbe rule." -.Dl ":0" -.Dl "{" -.Dl " # Generate a score for the message." -.Dl " SCORE=`spamprobe receive`" -.Dl " # Add a X-SpamProbe header to the message." 
-.Dl " :0 fhW" -.Dl " | formail -I ""X-SpamProbe: $SCORE""" -.Dl "}" -.Pp -.Dl "# Filter matching messages to their own mailbox." -.Dl ":0:" -.Dl "*^X-SpamProbe: SPAM" -.Dl "spamprobe" -.Sh DIAGNOSTICS -Exit status is 0 on success, and 1 if -.Nm -encounters an invalid command. -.Sh COMPATIBILITY -Version of -.Nm -previous to 0.7 use a different database format. To convert your existing -database to the new format use the following command. -.Pp -.Dl "spamprobe-export_0.6 | spamprobe import" -.Sh SEE ALSO -.Xr formail 1 , -.Xr procmail 1 , -.Rs -.%A "Paul Graham" -.%T "A Plan for Spam" -.%O http://www.paulgraham.com/spam.html -.%D "August 2002" -.Re -.Sh AUTHORS -This -manual page was written by -.An Matthew N. Dodd Aq mdodd@FreeBSD.org . -.Nm -was written by -.An Brian Burton Aq bburton@users.sourceforge.net diff --git a/mail/spamprobe/files/post-install-notes b/mail/spamprobe/pkg-message index c115a23a4bc5..c115a23a4bc5 100644 --- a/mail/spamprobe/files/post-install-notes +++ b/mail/spamprobe/pkg-message |