[prev in list] [next in list] [prev in thread] [next in thread] 

List:       pywikipediabot-users
Subject:    [Pywikipedia-l] SVN:  [5283] branches/rewrite/pywikibot
From:       russblau () mayflower ! knams ! wikimedia ! org
Date:       2008-04-29 16:18:53
Message-ID: E1JqsXd-0006Ca-T7 () lily ! knams ! wikimedia ! org
[Download RAW message or body]

Revision: 5283
Author:   russblau
Date:     2008-04-29 16:18:53 +0000 (Tue, 29 Apr 2008)

Log Message:
-----------
Added preloadpages method for Site; fixed bugs.

Modified Paths:
--------------
    branches/rewrite/pywikibot/data/api.py
    branches/rewrite/pywikibot/site.py
    branches/rewrite/pywikibot/throttle.py
    branches/rewrite/pywikibot/tools.py

Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/data/api.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -93,20 +93,6 @@
         self.params = {}
         if "action" not in kwargs:
             raise ValueError("'action' specification missing from Request.")
-        if kwargs["action"] == 'query':
-            if "meta" in kwargs:
-                if "userinfo" not in kwargs["meta"]:
-                    kwargs["meta"] += "|userinfo"
-            else:
-                kwargs["meta"] = "userinfo"
-            if "uiprop" in kwargs:
-                kwargs["uiprop"] += "|blockinfo|hasmsg"
-            else:
-                kwargs["uiprop"] = "blockinfo|hasmsg"
-        if "format" not in kwargs:
-            self.params["format"] = "json"
-        if "maxlag" not in kwargs:
-            self.params["maxlag"] = str(config.maxlag)
         self.update(**kwargs)
 
     # implement dict interface
@@ -138,10 +124,32 @@
         
         """
         from pywikibot.comms import http
-        if self.params['format'] != 'json':
+
+        for key in self.params:
+            if isinstance(self.params[key], basestring):
+                self.params[key] = self.params[key].split("|")
+        if self.params["action"] == ['query']:
+            meta = self.params.get("meta", [])
+            if "userinfo" not in meta:
+                meta.append("userinfo")
+                self.params["meta"] = meta
+            uiprop = self.params.get("uiprop", [])
+            uiprop = set(uiprop + ["blockinfo", "hasmsg"])
+            self.params["uiprop"] = list(uiprop)
+            if "properties" in self.params:
+                if "info" in self.params["properties"]:
+                    inprop = self.params.get("inprop", [])
+                    info = set(info + ["protection", "talkid", "subjectid"])
+                    self.params["info"] = list(info)
+        if "maxlag" not in self.params:
+            self.params["maxlag"] = [str(config.maxlag)]
+        if "format" not in self.params:
+            self.params["format"] = ["json"]
+        if self.params['format'] != ["json"]:
             raise TypeError("Query format '%s' cannot be parsed."
                             % self.params['format'])
         for key in self.params:
+            self.params[key] = "|".join(self.params[key])
             if isinstance(self.params[key], unicode):
                 self.params[key] = self.params[key].encode(self.site.encoding())
         params = urllib.urlencode(self.params)
@@ -353,12 +361,17 @@
         @type prop: str
 
         """
-        self.request = Request(action="query", prop=prop, **kwargs)
-        if prop not in self.limits:
-            raise ValueError("Unrecognized property '%s'" % prop)
+        if isinstance(prop, basestring):
+            prop = prop.split("|")
+        for p in prop:
+            if p not in self.limits:
+                raise ValueError("Unrecognized property '%s'" % p)
+        self.request = Request(action="query", prop="|".join(prop))
         # set limit to max, if applicable
-        if self.limits[prop] and kwargs.pop("getAll", False):
-            self.request['g'+self.limits[generator]] = "max"
+        for p in prop:
+            if self.limits[p] and kwargs.pop("getAll", False):
+                self.request['g'+self.limits[generator]] = "max"
+        self.request.params.update(kwargs)
         self.site = self.request.site
         self.resultkey = prop
 

Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/site.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -500,10 +500,8 @@
     def getpageinfo(self, page):
         """Load page info from api and save in page attributes"""
         title = page.title(withSection=False)
-        query = api.PropertyGenerator(
-                    "info",
-                    inprop="protection|talkid|subjectid",
-                    titles=title.encode(self.encoding()))
+        query = api.PropertyGenerator("info",
+                                      titles=title.encode(self.encoding()))
         for pageitem in query:
             if pageitem['title'] != title:
                 raise Error(
@@ -580,6 +578,66 @@
             api.update_page(target, pagedata)
             page._redir = target
 
+    def preloadpages(self, pagelist, size=60, lookahead=0):
+    """Return a generator over a list of preloaded pages.
+
+        @param pagelist: an iterable that returns Page objects
+        @param size: how many Pages to query at a time
+        @type size: int
+        @param lookahead: if greater than zero, preload pages in a
+            separate thread for greater responsiveness; higher values
+            result in more aggressive preloading
+        @type lookahead: int
+
+        """
+        from pywikibot.tools import itergroup, ThreadedGenerator
+        gen = ThreadedGenerator(target=itergroup,
+                                args=(pagelist, size),
+                                qsize=lookahead)
+        try:
+            for sublist in gen:
+                pageids = []
+                cache = {}
+                for p in sublist:
+                    if pageids is not None:
+                        if hasattr(p, "_pageid"):
+                            pageids.append(str(p._pageid))
+                        else:
+                            # only use pageids if all pages have them
+                            pageids = None
+                    cache[p.title(withSection=False)] = p
+                rvgen = api.PropertyGenerator("revisions|info")
+                if pageids is not None:
+                    rvgen.request["pageids"] = "|".join(pageids)
+                else:
+                    rvgen.request["titles"] = "|".join(cache.keys())
+                rvgen.request[u"rvprop"] = \
+                        u"ids|flags|timestamp|user|comment|content"
+                for pagedata in rvgen:
+                    if pagedata['title'] not in cache:
+                        raise Error(
+                        u"preloadpages: Query returned unexpected title '%s'"
+                             % pagedata['title']
+                        )
+                    page = cache[pagedata['title']]
+                    api.update_page(page, pagedata)
+                    if 'revisions' in pagedata: # true if page exists
+                        for rev in pagedata['revisions']:
+                            revision = pywikibot.page.Revision(
+                                                revid=rev['revid'],
+                                                timestamp=rev['timestamp'],
+                                                user=rev['user'],
+                                                anon=rev.has_key('anon'),
+                                                comment=rev.get('comment',  u''),
+                                                minor=rev.has_key('minor'),
+                                                text=rev.get('*', None)
+                                       )
+                            page._revisions[revision.revid] = revision
+                            page._revid = revision.revid
+                    yield page
+        finally:
+            gen.stop()
+
     # following group of methods map more-or-less directly to API queries
 
     def getbacklinks(self, page, followRedirects=False, filterRedirects=None,
@@ -819,7 +877,8 @@
             else:
                 page = Page(self, pagedata['title'])
             api.update_page(page, pagedata)
-
+            if 'revisions' not in pagedata:
+                continue
             for rev in pagedata['revisions']:
                 revision = pywikibot.page.Revision(
                                             revid=rev['revid'],
@@ -849,6 +908,8 @@
                 raise Error(
                     u"getlanglinks: Query on %s returned data on '%s'"
                     % (page, pageitem['title']))
+            if 'langlinks' not in pageitem:
+                continue
             for linkdata in pageitem['langlinks']:
                 yield pywikibot.Link(linkdata['*'],
                                      source=pywikibot.Site(linkdata['lang']))
@@ -864,6 +925,8 @@
                 raise RuntimeError(
                     "getlanglinks: Query on %s returned data on '%s'"
                     % (page, pageitem['title']))
+            if 'extlinks' not in pageitem:
+                continue
             for linkdata in pageitem['extlinks']:
                 yield linkdata['*']
 

Modified: branches/rewrite/pywikibot/throttle.py
===================================================================
--- branches/rewrite/pywikibot/throttle.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/throttle.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -107,7 +107,7 @@
             f.close()
             self.process_multiplicity = count
             if self.verbosedelay:
-                pywikibot.output(
+                logging.info(
                 u"Found %s processes running, including the current process."
                     % count)
         finally:
@@ -216,10 +216,10 @@
             self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
             # Announce the delay if it exceeds a preset limit
             if waittime > config.noisysleep:
-                pywikibot.output(u"Sleeping for %.1f seconds, %s"
-                                 % (waittime,
-                                    time.strftime("%Y-%m-%d %H:%M:%S",
-                                                  time.localtime()))
+                logging.warn(u"Sleeping for %.1f seconds, %s"
+                              % (waittime,
+                                 time.strftime("%Y-%m-%d %H:%M:%S",
+                                               time.localtime()))
                                  )
             time.sleep(waittime)
             if write:

Modified: branches/rewrite/pywikibot/tools.py
===================================================================
--- branches/rewrite/pywikibot/tools.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/tools.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -26,13 +26,14 @@
     all the generated values, it must call the generator's stop() method to
     stop the background thread.  Example usage:
 
-    >>> gen = ThreadedGenerator(target=foo)
+    >>> gen = ThreadedGenerator(target=xrange, args=(20,))
     >>> try:
     ...     for data in gen:
-    ...         do_work(data)
+    ...         print data,
     ... finally:
     ...     gen.stop()
-
+    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+    
     """
 
     def __init__(self, group=None, target=None, name="GeneratorThread",
@@ -95,3 +96,36 @@
         self.stop()
 
 
+def itergroup(iterable, size):
+    """Make an iterator that returns lists of (up to) size items from iterable.
+
+    Example:
+
+    >>> i = itergroup(xrange(25), 10)
+    >>> print i.next()
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> print i.next()
+    [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+    >>> print i.next()
+    [20, 21, 22, 23, 24]
+    >>> print i.next()
+    Traceback (most recent call last):
+     ...
+    StopIteration
+
+    """
+    chunk = []
+    for item in iter(iterable):
+        chunk.append(item)
+        if len(chunk) == size:
+            yield chunk
+            chunk = []
+    if chunk:
+        yield chunk
+
+            
+if __name__ == "__main__":
+    def _test():
+        import doctest
+        doctest.testmod()
+    _test()



_______________________________________________
Pywikipedia-l mailing list
Pywikipedia-l@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-l
[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic