[prev in list] [next in list] [prev in thread] [next in thread] 

List:       spambayes-checkins
Subject:    [Spambayes-checkins] spambayes/contrib pycksum.py,1.1,1.2
From:       montanaro () users ! sourceforge ! net (Skip Montanaro)
Date:       2006-08-18 2:29:05
Message-ID: 20060818022907.D10021E4004 () bag ! python ! org
[Download RAW message or body]

Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv16513

Modified Files:
	pycksum.py 
Log Message:
* Try to improve the duplicate detection capability.  Lots of spam nowadays
  has random text junk, so be more lenient about how many chunks have to
  match.  Also do a little more filtering on the source:

  - Compress multiple spaces and tabs to a single space
  - Compress multiple contiguous newlines into one
  - Map each run of digits to a single "#" character
  - Map some common HTML entities to their plain-text equivalents

* Use md5 checksum hexdigests instead of binascii.b2a_hex.

* Correct line breaking of filtered body.

* Use email.generator to flatten body instead of the broken flatten()
  function.


Index: pycksum.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/pycksum.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** pycksum.py	25 May 2004 14:58:39 -0000	1.1
--- pycksum.py	18 Aug 2006 02:29:02 -0000	1.2
***************
*** 39,60 ****
  import sys
  import email.Parser
  import md5
  import anydbm
  import re
  import time
! import binascii
! 
! def flatten(body):
!     # three types are possible: list, string, Message
!     if isinstance(body, str):
!         return body
!     if hasattr(body, "get_payload"):
!         payload = body.get_payload()
!         if payload is None:
!             return ""
!         return flatten(payload)
!     if isinstance(body, list):
!         return "\n".join([flatten(b) for b in body])
!     raise TypeError, ("unrecognized body type: %s" % type(body))
  
  def clean(data):
--- 39,51 ----
  import sys
  import email.Parser
+ import email.generator
  import md5
  import anydbm
  import re
  import time
! try:
!     import cStringIO as StringIO
! except ImportError:
!     import StringIO
  
  def clean(data):
***************
*** 67,74 ****
      data = re.sub(r"<[^>]*>", "", data).lower()
  
      # delete anything which looks like a url or email address
      # not sure what a pmguid: url is but it seems to occur frequently in spam
      # also convert all runs of whitespace into a single space
!     return " ".join([w for w in data.split()
                       if ('@' not in w and
                           (':' not in w or
--- 58,78 ----
      data = re.sub(r"<[^>]*>", "", data).lower()
  
+     # Map each run of digits to a single '#'
+     data = re.sub(r"[0-9]+", "#", data)
+ 
+     # Map a few common html entities
+     data = re.sub(r"(&nbsp;)+", " ", data)
+     data = re.sub(r"&lt;", "<", data)
+     data = re.sub(r"&gt;", ">", data)
+     data = re.sub(r"&amp;", "&", data)
+ 
+     # Elide blank lines and multiple horizontal whitespace
+     data = re.sub(r"\n+", "\n", data)
+     data = re.sub(r"[ \t]+", " ", data)
+ 
      # delete anything which looks like a url or email address
      # not sure what a pmguid: url is but it seems to occur frequently in spam
      # also convert all runs of whitespace into a single space
!     return " ".join([w for w in data.split(" ")
                       if ('@' not in w and
                           (':' not in w or
***************
*** 87,97 ****
      # separately or in various combinations if desired.
  
!     body = flatten(msg)
!     lines = clean(body)
      chunksize = len(lines)//4+1
      sum = []
      for i in range(4):
          chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
!         sum.append(binascii.b2a_hex(md5.new(chunk).digest()))
  
      return ".".join(sum)
--- 91,105 ----
      # separately or in various combinations if desired.
  
!     fp = StringIO.StringIO()
!     g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60)
!     g.flatten(msg)
!     text = fp.getvalue()
!     body = text.split("\n\n", 1)[1]
!     lines = clean(body).split("\n")
      chunksize = len(lines)//4+1
      sum = []
      for i in range(4):
          chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
!         sum.append(md5.new(chunk).hexdigest())
  
      return ".".join(sum)
***************
*** 102,111 ****
      db = anydbm.open(f, "c")
      maxdblen = 2**14
!     # consider the first three pieces, the last three pieces and the middle
!     # two pieces - one or more will likely eliminate attempts at disrupting
!     # the checksum - if any are found in the db file, call it a match
!     for subsum in (".".join(pieces[:-1]),
                     ".".join(pieces[1:-1]),
!                    ".".join(pieces[1:])):
          if not db.has_key(subsum):
              db[subsum] = str(time.time())
--- 110,119 ----
      db = anydbm.open(f, "c")
      maxdblen = 2**14
!     # consider the first two pieces, the middle two pieces and the last two
!     # pieces - one or more will likely eliminate attempts at disrupting the
!     # checksum - if any are found in the db file, call it a match
!     for subsum in (".".join(pieces[:-2]),
                     ".".join(pieces[1:-1]),
!                    ".".join(pieces[2:])):
          if not db.has_key(subsum):
              db[subsum] = str(time.time())
***************
*** 155,157 ****
  if __name__ == "__main__":
      sys.exit(main(sys.argv[1:]))
- 
--- 163,164 ----


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic