List:       mailman-cvs
Subject:    [Mailman-checkins] CVS: mailman/Mailman/Archiver HyperDatabase.py,1.8,1.9 pipermail.py,1.10,1.11
From:       Jeremy Hylton <jhylton () users ! sourceforge ! net>
Date:       2000-09-22 18:23:40

Update of /cvsroot/mailman/mailman/Mailman/Archiver
In directory slayer.i.sourceforge.net:/tmp/cvs-serv11650

Modified Files:
	HyperDatabase.py pipermail.py 
Log Message:
Fix an index generation bug that occasionally prevented messages from
appearing in the index.  pipermail generated several indexes on the
assumption that the date was unique.  If two messages arrived with,
e.g., the same author and date, the author index treated them as
identical.  As a result, both messages were archived, but only the
last one appeared in the index.  The solution is to always include
the msgid, which is unique, in the index key.

Change the database keys to combine their elements using tuples
instead of string concatenation with \000 as the separator.
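
For illustration, the key construction in addArticle (simplified from
the pipermail.py hunk below) changes roughly like this:

    # Old keys: strings joined with \000; they collide when two
    # articles share the same author/subject and date
    authorkey = fixAuthor(article.author) + '\000' + article.date
    subjectkey = string.lower(article.subject) + '\000' + article.date

    # New keys: tuples that always end with the unique msgid
    authorkey = (author or article.author, article.date, article.msgid)
    subjectkey = (subject or article.subject, article.date, article.msgid)
    datekey = (date or article.date, article.msgid)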

The fix was accomplished by refactoring pipermail.Database and its
subclasses.  Index-key generation is pushed into the common concrete
base class Database; the abstract base class is renamed to
DatabaseInterface.  The addArticle method is broken up into several
pieces.
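
The resulting layering, abridged from the diff: the shared
Database.addArticle builds the tuple keys and stores the article,
while each subclass's addArticle only opens its indices and then
delegates to the base class:

    class HyperDatabase(pipermail.Database):
        __super_addArticle = pipermail.Database.addArticle

        def addArticle(self, archive, article, subject=None, author=None,
                       date=None):
            # open the per-archive indices, then let the common base
            # class build the keys and update the indexes
            self.__openIndices(archive)
            self.__super_addArticle(archive, article, subject, author, date)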

TBD: There is still more refactoring to do on the Database class.

Because the date key has changed, the HyperDatabase methods that
return the first and last dates have been changed to reflect the new
key format.
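
Concretely, firstdate() and lastdate() now unpack a (date, msgid)
tuple key and format the date from its first element, as in the
HyperDatabase hunk below:

    datekey, msgid = self.dateIndex.first()
    date = time.asctime(time.localtime(string.atof(datekey[0])))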

Refactor pipermail.T.add_article into several pieces.



Index: HyperDatabase.py
===================================================================
RCS file: /cvsroot/mailman/mailman/Mailman/Archiver/HyperDatabase.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -r1.8 -r1.9
*** HyperDatabase.py	2000/06/23 04:30:10	1.8
--- HyperDatabase.py	2000/09/22 18:23:37	1.9
***************
*** 21,24 ****
--- 21,26 ----
  import marshal
  import string
+ import sys
+ import time
  import errno
  
***************
*** 40,44 ****
      import pickle
  
- 
  #
  # we're using a python dict in place of
--- 42,45 ----
***************
*** 48,57 ****
  #
  class DumbBTree:
!     # XXX This dictionary-like object stores pickles of all the
!     # Article objects.  The object itself is stored using marshal.  It 
!     # would be much simpler, and probably faster, to store the actual
!     # objects in the DumbBTree and pickle it.
!     # XXX Also needs a more sensible name, like IteratableDictionary
!     # or SortedDictionary.
      def __init__(self, path):
          self.current_index = 0
--- 49,63 ----
  #
  class DumbBTree:
!     """Stores pickles of Article objects
! 
!     This dictionary-like object stores pickles of all the Article
!     objects.  The object itself is stored using marshal.  It would be
!     much simpler, and probably faster, to store the actual objects in
!     the DumbBTree and pickle it.
!     
!     TBD: Also needs a more sensible name, like IteratableDictionary or
!     SortedDictionary.
!     """
!     
      def __init__(self, path):
          self.current_index = 0
***************
*** 76,79 ****
--- 82,88 ----
              self.__sort(dirty=1)
  
+     def __repr__(self):
+         return "DumbBTree(%s)" % self.path
+ 
      def __sort(self, dirty=None):
          if self.__dirty == 1 or dirty:
***************
*** 124,130 ****
          else:
              key = self.sorted[0]
-             res = key, self.dict[key]
              self.current_index = 1
! 	    return res
  
      def last(self):
--- 133,138 ----
          else:
              key = self.sorted[0]
              self.current_index = 1
! 	    return key, self.dict[key]
  
      def last(self):
***************
*** 179,183 ****
          self.unlock()
  
- 
  
  # this is lifted straight out of pipermail with
--- 187,190 ----
***************
*** 187,190 ****
--- 194,199 ----
  #
  class HyperDatabase(pipermail.Database):
+     __super_addArticle = pipermail.Database.addArticle
+     
      def __init__(self, basedir):
          self.__cache = {}
***************
*** 195,215 ****
  
      def firstdate(self, archive):
- 	import time
  	self.__openIndices(archive)
! 	date='None'
  	try:
! 	    date, msgid = self.dateIndex.first()
! 	    date=time.asctime(time.localtime(string.atof(date)))
! 	except KeyError: pass
  	return date
  
      def lastdate(self, archive):
- 	import time
  	self.__openIndices(archive)
! 	date='None'
  	try:
! 	    date, msgid = self.dateIndex.last()
! 	    date=time.asctime(time.localtime(string.atof(date)))
! 	except KeyError: pass
  	return date
  
--- 204,224 ----
  
      def firstdate(self, archive):
  	self.__openIndices(archive)
! 	date = 'None'
  	try:
! 	    datekey, msgid = self.dateIndex.first()
! 	    date = time.asctime(time.localtime(string.atof(datekey[0])))
! 	except KeyError:
!             pass
  	return date
  
      def lastdate(self, archive):
  	self.__openIndices(archive)
! 	date = 'None'
  	try:
! 	    datekey, msgid = self.dateIndex.last()
! 	    date = time.asctime(time.localtime(string.atof(datekey[0])))
! 	except KeyError:
!             pass
  	return date
  
***************
*** 218,281 ****
  	return len(self.dateIndex)    
  
!     # Add a single article to the internal indexes for an archive.
! 
!     def addArticle(self, archive, article, subjectkey, authorkey):
  	self.__openIndices(archive)
  
- 	# Add the new article
- 	self.dateIndex[article.date]=article.msgid
- 	self.authorIndex[authorkey]=article.msgid
- 	self.subjectIndex[subjectkey]=article.msgid
- 	# Set the 'body' attribute to empty, to avoid storing the whole message
- 	temp = article.body ; article.body=[]
- 	self.articleIndex[article.msgid]=pickle.dumps(article)
- 	article.body=temp
- 	self.changed[archive,article.msgid]=None
- 
- 	parentID=article.parentID
- 	if parentID!=None and self.articleIndex.has_key(parentID): 
- 	    parent=self.getArticle(archive, parentID)
- 	    myThreadKey=parent.threadKey+article.date+'-'
- 	else: myThreadKey = article.date+'-'
- 	article.threadKey=myThreadKey
- 	self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid)
- 
-     # Open the BSDDB files that are being used as indices
-     # (dateIndex, authorIndex, subjectIndex, articleIndex)
      def __openIndices(self, archive):
! 	if self.__currentOpenArchive==archive: return
  	self.__closeIndices()
! 	arcdir=os.path.join(self.basedir, 'database')
! 	try: mkdir(arcdir, mode=02770)
!         except os.error: pass
! 	for i in ['date', 'author', 'subject', 'article', 'thread']:
! 	    t=DumbBTree(os.path.join(arcdir, archive+'-'+i)) 
! 	    setattr(self, i+'Index', t)
! 	self.__currentOpenArchive=archive
  
-     # Close the BSDDB files that are being used as indices (if they're
-     # open--this is safe to call if they're already closed)
      def __closeIndices(self):
! 	if self.__currentOpenArchive!=None: 
! 	    pass
! 	for i in ['date', 'author', 'subject', 'thread', 'article']:
! 	    attr=i+'Index'
  	    if hasattr(self, attr): 
! 		index=getattr(self, attr) 
! 		if i=='article': 
  	            if not hasattr(self, 'archive_length'):
!                         self.archive_length={}
! 		    self.archive_length[self.__currentOpenArchive]=len(index)
  		index.close() 
! 		delattr(self,attr)
! 	self.__currentOpenArchive=None
      def close(self):
  	self.__closeIndices()
      def hasArticle(self, archive, msgid): 
  	self.__openIndices(archive)
  	return self.articleIndex.has_key(msgid)
      def setThreadKey(self, archive, key, msgid):
  	self.__openIndices(archive)
  	self.threadIndex[key]=msgid
      def getArticle(self, archive, msgid):
  	self.__openIndices(archive)
--- 227,274 ----
  	return len(self.dateIndex)    
  
!     def addArticle(self, archive, article, subject=None, author=None,
!                    date=None):
  	self.__openIndices(archive)
+         self.__super_addArticle(archive, article, subject, author, date)
  
      def __openIndices(self, archive):
! 	if self.__currentOpenArchive == archive:
!             return
  	self.__closeIndices()
! 	arcdir = os.path.join(self.basedir, 'database')
! 	try:
!             mkdir(arcdir, mode=02770)
!         except os.error:
!             pass
! 	for i in ('date', 'author', 'subject', 'article', 'thread'):
! 	    t = DumbBTree(os.path.join(arcdir, archive + '-' + i)) 
! 	    setattr(self, i + 'Index', t)
! 	self.__currentOpenArchive = archive
  
      def __closeIndices(self):
! 	for i in ('date', 'author', 'subject', 'thread', 'article'):
! 	    attr = i + 'Index'
  	    if hasattr(self, attr): 
! 		index = getattr(self, attr) 
! 		if i == 'article': 
  	            if not hasattr(self, 'archive_length'):
!                         self.archive_length = {}
!                     l = len(index)
!                     self.archive_length[self.__currentOpenArchive] = l
  		index.close() 
! 		delattr(self, attr)
! 	self.__currentOpenArchive = None
!         
      def close(self):
  	self.__closeIndices()
+         
      def hasArticle(self, archive, msgid): 
  	self.__openIndices(archive)
  	return self.articleIndex.has_key(msgid)
+     
      def setThreadKey(self, archive, key, msgid):
  	self.__openIndices(archive)
  	self.threadIndex[key]=msgid
+         
      def getArticle(self, archive, msgid):
  	self.__openIndices(archive)
***************
*** 290,305 ****
      def first(self, archive, index): 
  	self.__openIndices(archive)
! 	index=getattr(self, index+'Index')
  	try: 
  	    key, msgid = index.first()
  	    return msgid
! 	except KeyError: return None
      def next(self, archive, index): 
  	self.__openIndices(archive)
! 	index=getattr(self, index+'Index')
  	try: 
  	    key, msgid = index.next()
  	    return msgid
! 	except KeyError: return None
  	
      def getOldestArticle(self, archive, subject):
--- 283,301 ----
      def first(self, archive, index): 
  	self.__openIndices(archive)
! 	index = getattr(self, index + 'Index')
  	try: 
  	    key, msgid = index.first()
  	    return msgid
! 	except KeyError:
!             return None
!         
      def next(self, archive, index): 
  	self.__openIndices(archive)
! 	index = getattr(self, index + 'Index')
  	try: 
  	    key, msgid = index.next()
  	    return msgid
! 	except KeyError:
!             return None
  	
      def getOldestArticle(self, archive, subject):
***************
*** 315,319 ****
  	    return None
  
!     def newArchive(self, archive): pass
      def clearIndex(self, archive, index):
  	self.__openIndices(archive)
--- 311,317 ----
  	    return None
  
!     def newArchive(self, archive):
!         pass
!     
      def clearIndex(self, archive, index):
  	self.__openIndices(archive)

Index: pipermail.py
===================================================================
RCS file: /cvsroot/mailman/mailman/Mailman/Archiver/pipermail.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -r1.10 -r1.11
*** pipermail.py	2000/09/22 02:41:25	1.10
--- pipermail.py	2000/09/22 18:23:37	1.11
***************
*** 59,68 ****
  # Abstract class for databases
  
! class Database:    
      def __init__(self): pass
      def close(self): pass
      def getArticle(self, archive, msgid): pass
      def hasArticle(self, archive, msgid): pass
!     def addArticle(self, archive, article, subjectkey, authorkey): pass
      def firstdate(self, archive): pass
      def lastdate(self, archive): pass
--- 59,69 ----
  # Abstract class for databases
  
! class DatabaseInterface:    
      def __init__(self): pass
      def close(self): pass
      def getArticle(self, archive, msgid): pass
      def hasArticle(self, archive, msgid): pass
!     def addArticle(self, archive, article, subject=None, author=None,
!                    date=None): pass
      def firstdate(self, archive): pass
      def lastdate(self, archive): pass
***************
*** 74,77 ****
--- 75,132 ----
      def getOldestArticle(self, subject): pass
  
+ class Database(DatabaseInterface):
+     """Define the basic sorting logic for a database
+ 
+     Assumes that the database internally uses dateIndex, authorIndex,
+     etc.
+     """
+ 
+     # TBD Factor out more of the logic shared between BSDDBDatabase
+     # and HyperDatabase and place it in this class.
+     
+     def __init__(self):
+         # This method need not be called by subclasses that do their
+         # own initialization.
+         self.dateIndex = {}
+         self.authorIndex = {}
+         self.subjectIndex = {}
+         self.articleIndex = {}
+         self.changed = {}
+     
+     def addArticle(self, archive, article, subject=None, author=None,
+                    date=None):
+         # create the keys; always end w/ msgid which will be unique
+         authorkey = (author or article.author, article.date,
+                      article.msgid)
+         subjectkey = (subject or article.subject, article.date,
+                       article.msgid)
+         datekey = date or article.date, article.msgid
+ 
+ 	# Add the new article
+ 	self.dateIndex[datekey] = article.msgid
+ 	self.authorIndex[authorkey] = article.msgid
+ 	self.subjectIndex[subjectkey] = article.msgid
+ 
+         self.store_article(article)
+ 	self.changed[archive, article.msgid] = None
+ 
+ 	parentID = article.parentID
+ 	if parentID is not None and self.articleIndex.has_key(parentID): 
+ 	    parent = self.getArticle(archive, parentID)
+ 	    myThreadKey = parent.threadKey + article.date + '-'
+ 	else:
+             myThreadKey = article.date + '-'
+ 	article.threadKey = myThreadKey
+         key = myThreadKey, article.msgid
+ 	self.setThreadKey(archive, key, article.msgid)
+ 
+     def store_article(self, article):
+         """Store article without message body to save space"""
+         # TBD this is not thread safe!
+ 	temp = article.body
+         article.body = []
+ 	self.articleIndex[article.msgid] = pickle.dumps(article)
+ 	article.body = temp
+ 
  # The Article class encapsulates a single posting.  The attributes 
  # are:
***************
*** 331,335 ****
                      article.threadKey = parent.threadKey+article.date+'-' 
                  self.database.setThreadKey(self.archive,
!                     article.threadKey + '\000' + article.msgid,
                      msgid)
  	    msgid = self.database.next(self.archive, 'date')
--- 386,390 ----
                      article.threadKey = parent.threadKey+article.date+'-' 
                  self.database.setThreadKey(self.archive,
!                     (article.threadKey, article.msgid),
                      msgid)
  	    msgid = self.database.next(self.archive, 'date')
***************
*** 461,467 ****
  	    self.add_article(a)
  
!     # Archive an Article object.
      def add_article(self, article):
- 	# Determine into what archives the article should be placed
  	archives = self.get_archives(article)
          if not archives:
--- 516,535 ----
  	    self.add_article(a)
  
!     def new_archive(self, archive, archivedir):
!         self.archives.append(archive)
!         self.update_TOC = 1
!         self.database.newArchive(archive)
!         # If the archive directory doesn't exist, create it
!         try:
!             os.stat(archivedir)
!         except os.error, errdata:
!             errno, errmsg = errdata
!             if errno == 2: 
!                 mkdir(archivedir, self.DIRMODE)
!             else:
!                 raise os.error, errdata
!         self.open_new_archive(archive, archivedir)
! 
      def add_article(self, article):
  	archives = self.get_archives(article)
          if not archives:
***************
*** 470,538 ****
              archives = [archives]
  
- 	# Add the article to each archive in turn
  	article.filename = filename = self.get_filename(article)
! 	temp = self.format_article(article) # Reformat the article
!         fmt = "Processing article #%s into archives %s"
! 	self.message(fmt % (article.sequence, archives))
! 	for i in archives:
! 	    self.archive = i
! 	    archivedir = os.path.join(self.basedir, i)
! 	    # If it's a new archive, create it
! 	    if i not in self.archives: 
! 		self.archives.append(i)
!                 self.update_TOC = 1
! 		self.database.newArchive(i)
! 		# If the archive directory doesn't exist, create it
! 		try:
!                     os.stat(archivedir)
! 		except os.error, errdata:
! 		    errno, errmsg = errdata
! 		    if errno == 2: 
! 			mkdir(archivedir, self.DIRMODE)
! 		    else:
!                         raise os.error, errdata
! 		self.open_new_archive(i, archivedir)
  		
  	    # Write the HTML-ized article
!             self.write_article(i, temp, os.path.join(archivedir,
!                                                      filename))  
  
! 	    authorkey = fixAuthor(article.author) + '\000' + article.date
! 	    subjectkey = string.lower(article.subject ) +'\000' + article.date
  
! 	    # Update parenting info
! 	    parentID = None
! 	    if article.in_reply_to:
!                 parentID = article.in_reply_to
! 	    elif article.references: 
! 		refs = self._remove_external_references(article.references)
!                 if refs:
!                     maxdate = self.database.getArticle(self.archive,
!                                                        refs[0])
!                     for ref in refs[1:]:
!                         a = self.database.getArticle(self.archive, ref)
!                         if a.date > maxdate.date:
!                             maxdate = a
! 		    parentID = maxdate.msgid
! 	    else:
! 		# Get the oldest article with a matching subject, and
! 		# assume this is a follow-up to that article
! 		parentID = self.database.getOldestArticle(self.archive,
!                                                           article.subject) 
! 
! 	    if parentID is not None \
!                and not self.database.hasArticle(self.archive, parentID): 
! 		parentID = None
! 	    article.parentID = parentID 
! 	    if parentID is not None:
! 		parent = self.database.getArticle(self.archive, parentID)
! 		article.threadKey = parent.threadKey + article.date + '-'
! 	    else:
                  article.threadKey = article.date + '-'
!             key = article.threadKey + '\000' + article.msgid
!    	    self.database.setThreadKey(self.archive, key, article.msgid)
! 	    self.database.addArticle(i, temp, subjectkey, authorkey)
! 	    if i not in self._dirty_archives: 
! 		self._dirty_archives.append(i)
  
      def write_article(self, index, article, path):
--- 538,597 ----
              archives = [archives]
  
  	article.filename = filename = self.get_filename(article)
! 	temp = self.format_article(article)
!         fmt = "Processing article #%s into archives %s: %s"
! 	self.message(fmt % (article.sequence, archives, article.subject))
! 	for arch in archives:
! 	    self.archive = arch # why do this???
! 	    archivedir = os.path.join(self.basedir, arch)
! 	    if arch not in self.archives:
!                 self.new_archive(arch, archivedir)
  		
  	    # Write the HTML-ized article
!             self.write_article(arch, temp, os.path.join(archivedir,
!                                                         filename))  
  
!             author = fixAuthor(article.author)
!             subject = string.lower(article.subject)
  
!             article.parentID = parentID = self.get_parent_info(arch, article)
!             if parentID:
!                 parent = self.database.getArticle(arch, parentID)
!                 article.threadKey = parent.threadKey + article.date + '-'
!             else:
                  article.threadKey = article.date + '-'
!             key = article.threadKey, article.msgid
!             
!    	    self.database.setThreadKey(arch, key, article.msgid)
! 	    self.database.addArticle(arch, temp, author=author,
!                                      subject=subject)
!             
! 	    if arch not in self._dirty_archives: 
! 		self._dirty_archives.append(arch)
! 
!     def get_parent_info(self, archive, article):
!         parentID = None
!         if article.in_reply_to:
!             parentID = article.in_reply_to
!         elif article.references: 
!             refs = self._remove_external_references(article.references)
!             if refs:
!                 maxdate = self.database.getArticle(archive, refs[0])
!                 for ref in refs[1:]:
!                     a = self.database.getArticle(archive, ref)
!                     if a.date > maxdate.date:
!                         maxdate = a
!                 parentID = maxdate.msgid
!         else:
!             # Get the oldest article with a matching subject, and
!             # assume this is a follow-up to that article
!             parentID = self.database.getOldestArticle(archive,
!                                                       article.subject) 
! 
!         if parentID and not self.database.hasArticle(archive, parentID): 
!             parentID = None
!         return parentID
!     
!         
  
      def write_article(self, index, article, path):
***************
*** 589,592 ****
--- 648,653 ----
  
  class BSDDBdatabase(Database):
+     __super_addArticle = Database.addArticle
+     
      def __init__(self, basedir):
  	self.__cachekeys = []
***************
*** 595,599 ****
  	self.basedir = os.path.expanduser(basedir)
  	self.changed = {} # Recently added articles, indexed only by
! 	                  # message ID 
      def firstdate(self, archive):
  	self.__openIndices(archive)
--- 656,661 ----
  	self.basedir = os.path.expanduser(basedir)
  	self.changed = {} # Recently added articles, indexed only by
! 	                  # message ID
!                           
      def firstdate(self, archive):
  	self.__openIndices(archive)
***************
*** 605,608 ****
--- 667,671 ----
              pass
  	return date
+     
      def lastdate(self, archive):
  	self.__openIndices(archive)
***************
*** 614,652 ****
              pass
  	return date
      def numArticles(self, archive):
  	self.__openIndices(archive)
  	return len(self.dateIndex)    
- 
-     # Add a single article to the internal indexes for an archive.
  
!     def addArticle(self, archive, article, subjectkey, authorkey):
! 	self.__openIndices(archive)
  
- 	# Add the new article
- 	self.dateIndex[article.date] = article.msgid
- 	self.authorIndex[authorkey] = article.msgid
- 	self.subjectIndex[subjectkey] = article.msgid
- 	# Set the 'body' attribute to empty, to avoid storing the
- 	# whole message 
- 	temp = article.body
-         article.body = []
- 	self.articleIndex[article.msgid] = pickle.dumps(article)
- 	article.body = temp
- 	self.changed[archive,article.msgid] = None
- 
- 	parentID = article.parentID
- 	if parentID is not None and self.articleIndex.has_key(parentID): 
- 	    parent = self.getArticle(archive, parentID)
- 	    myThreadKey = parent.threadKey+article.date + '-'
- 	else:
-             myThreadKey = article.date + '-'
- 	article.threadKey = myThreadKey
-         key = myThreadKey + '\000' + article.msgid
- 	self.setThreadKey(archive, key, article.msgid)
- 
      # Open the BSDDB files that are being used as indices
      # (dateIndex, authorIndex, subjectIndex, articleIndex)
      def __openIndices(self, archive):
! 	if self.__currentOpenArchive == archive: return
  
  	import bsddb
--- 677,695 ----
              pass
  	return date
+     
      def numArticles(self, archive):
  	self.__openIndices(archive)
  	return len(self.dateIndex)    
  
!     def addArticle(self, archive, article, subject=None, author=None,
!                    date=None):
!         self.__openIndices(archive)
!         self.__super_addArticle(archive, article, subject, author, date)
  
      # Open the BSDDB files that are being used as indices
      # (dateIndex, authorIndex, subjectIndex, articleIndex)
      def __openIndices(self, archive):
! 	if self.__currentOpenArchive == archive:
!             return
  
  	import bsddb
***************
*** 677,680 ****
--- 720,724 ----
  		delattr(self,attr)
  	self.__currentOpenArchive = None
+         
      def close(self):
  	self.__closeIndices()


_______________________________________________
Mailman-checkins mailing list
Mailman-checkins@python.org
http://www.python.org/mailman/listinfo/mailman-checkins
