[prev in list] [next in list] [prev in thread] [next in thread] 

List:       opencsw-devel
Subject:    Re: SF.net SVN: gar:[22910] csw/mgar/gar/v2/lib/python/compare_catalog.py
From:       Maciej_(Matchek)_BliziƄski <maciej () opencsw ! org>
Date:       2014-01-28 16:36:26
Message-ID: CALtRa-4KeAyn05HExaewfcuTAcF4UHZotooxExOzwPf-aQMh4Q () mail ! gmail ! com
[Download RAW message or body]

Hi Carsten,

More comments! I hope you'll be able to reduce the size of this script.

2014-01-28 <cgrzemba@users.sourceforge.net>

> Revision: 22910
>           http://sourceforge.net/p/gar/code/22910
> Author:   cgrzemba
> Date:     2014-01-28 16:20:56 +0000 (Tue, 28 Jan 2014)
> Log Message:
> -----------
> use argparse, add out of order pkg compare
>
> Modified Paths:
> --------------
>     csw/mgar/gar/v2/lib/python/compare_catalog.py
>
> Modified: csw/mgar/gar/v2/lib/python/compare_catalog.py
> ===================================================================
> --- csw/mgar/gar/v2/lib/python/compare_catalog.py       2014-01-28
> 12:36:44 UTC (rev 22909)
> +++ csw/mgar/gar/v2/lib/python/compare_catalog.py       2014-01-28
> 16:20:56 UTC (rev 22910)
> @@ -2,55 +2,108 @@
>
>  import cjson
>  import logging
> -import optparse
> +import argparse
>  import urllib2
>  import sys
> +import re
>
>  logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s')
>  logger = logging.getLogger(__name__)
>
> +remote_scheme = ['http','https']
> +local_scheme = ['file']
> +
> +def prepareCatListFromURI(uri):
> +    catlst = []
> +    if '://' in uri:
>

We can say that you have to have a valid URI that either starts with
http://or https://or file://


> +        scheme = uri.split(':')[0]
> +        if scheme in remote_scheme:
> +            logger.info("fetch remote %s", uri)
> +            data = urllib2.urlopen(uri).read()
>

Let's use the requests module. We have a package.

http://sourceforge.net/apps/trac/gar/browser/csw/mgar/gar/v2/lib/python/rest.py#L250


> +            catlst = cjson.decode(data)
> +            for e in catlst:
> +                del e[9]
> +            return catlst
> +        elif scheme in local_scheme:
> +            uri = re.sub('.*://','',uri)
> +        else:
> +            logger.error('unsupported URI format')
> +            sys.exit(4)
> +    with open(uri) as lcat:
> +        logger.info("fetch local %s", uri)
> +        for line in lcat: # skip 4 lines header '# CREATIONDATE'
>

We already have a parser, please use it.
http://sourceforge.net/apps/trac/gar/browser/csw/mgar/gar/v2/lib/python/catalog.py#L66


> +            if line.startswith("# CREATIONDATE"):
> +                break
> +        for line in lcat:
> +            if line.startswith("-----BEGIN PGP SIGNATURE"):
> +                break
> +            catlst.append(line.rstrip().split(' '))
> +    return catlst
> +
> +def compareOutOfOrder(a_catlst, b_catlst, idx):
> +    a_pkgName2Idx = {}
> +    i = idx
> +    for j in range(idx,len(a_catlst)):
> +        a_pkgName2Idx[a_catlst[j][0]] = j
> +    # import pdb; pdb.set_trace()
> +    while i < len(b_catlst):
> +        if b_catlst[i][0] in a_pkgName2Idx:
> +            if b_catlst[i] != a_catlst[a_pkgName2Idx[b_catlst[i][0]]]:
> +                logger.warning("pkgs different at {0},{1}: {2}
> {3}".format(i,a_pkgName2Idx[b_catlst[i][0]],a_catlst[a_pkgName2Idx[b_catlst[i][0]]],b_catlst[i]))
> +                sys.exit(1)
> +        else:
> +            logger.warning("not in acat: %s", b_catlst[i])
> +            sys.exit(1)
> +        i += 1
> +    b_pkgName2Idx = {}
> +    for j in range(idx,len(b_catlst)):
> +        b_pkgName2Idx[b_catlst[j][0]] = j
> +    # import pdb; pdb.set_trace()
> +    i = idx
> +    while i < len(a_catlst):
> +        if a_catlst[i][0] not in b_pkgName2Idx:
> +            logger.warning("not in bcat: %s", a_catlst[i])
> +            sys.exit(1)
> +        i += 1
>

Why not convert both to a data structure consisting of basic types: nested
lists and dicts? Then you can just compare them using the == operator. If
you wanted some diagnostic output to display the difference, you can always
serialize them and display the textual diff - it will save you lots of
lines of code.


>  def main():
> -    parser = optparse.OptionParser()
> -    parser.add_option("-v","--verbose", dest="verbose",
> action="store_true",default=False)
> -    parser.add_option("-a","--existing-catalog", dest="oldcatalog",
> -                    help='set URI of existing catalog', metavar =
> 'catalog')
> -    parser.add_option("-b","--new-catalog", dest="newcatalog",
> -                    help='set URI of catalog to generate', metavar =
> 'catalog')
> -    options, args = parser.parse_args()
> +    parser = argparse.ArgumentParser()
> +    parser.add_argument("-v","--verbose", dest="verbose",
> action="store_true",default=False)
> +    parser.add_argument("acat",help="catalog URI")
> +    parser.add_argument("bcat",help="catalog URI")
> +    args = parser.parse_args()
>      opterror = False
> -    if options.verbose:
> +    if args.verbose:
>          logger.setLevel(logging.INFO)
> -    if options.debug:
> -        logger.setLevel(logging.DEBUG)
> -    if options.newcatalog is None or options.oldcatalog is None:
> -        logger.error("mandatory option missing")
> +    if args.acat is None or args.bcat is None:
> +        logger.error("mandatory args 'acat' 'bcat' missing")
>          sys.exit(2)
> -    oldcat = options.oldcatalog
> -    newcat = options.newcatalog
> -    logger.info(" compare %s with %s", oldcat, newcat)
>
> -    data = urllib2.urlopen(oldcat).read()
> -    a_catlst = cjson.decode(data)
> -    for e in a_catlst:
> -        del e[9]
> -    b_catlst = []
> -    with open(newcat) as nc:
> -        for i in range(4): # skip 4 lines header
> -            nc.readline()
> -        for cl in nc.readlines():
> -            if "-----BEGIN" == cl.split(' ')[0]:
> -                break
> -            b_catlst.append(cl.rstrip().split(' '))
> +    logger.info("fetch cat_a %s", args.acat)
> +    a_catlst = prepareCatListFromURI(args.acat)
> +
> +    logger.info("fetch cat_b %s", args.bcat)
> +    b_catlst = prepareCatListFromURI(args.bcat)
> +
> +    logger.info("compare ...")
>      if len(a_catlst) != len(b_catlst):
> -        logger.warning("a has %d, b has %d
> packges",len(a_catlst),len(b_catlst))
> -        sys.exit(1)
> +        logger.warning("a has %d, b has %d
> packages",len(a_catlst),len(b_catlst))
> +        # sys.exit(1)
>      for i in range(len(b_catlst)):
> -        if b_catlst[i] != a_catlst[i] :
> -            logger.warning("a is {0}, b is
> {1}".format(a_catlst[i],b_catlst[i]))
> -            sys.exit(1)
> +        try:
> +            if b_catlst[i] != a_catlst[i] :
> +                if b_catlst[i][0] != a_catlst[i][0]:
> +                    logger.warning("packages out of order: A: %s; B:
> %s",a_catlst[i][0], b_catlst[i][0])
>

Hm, what I meant is that out of order comparing:

1. should just work
2. should not be a special case

The code should use such data structures that the ordering doesn't matter.
For example, if you use a dict, then the ordering doesn't matter:

>>> a = dict([('a', 1), ('b', 2)])
>>> b = dict([('b', 2), ('a', 1)])
>>> a == b
True



> +                    compareOutOfOrder(a_catlst, b_catlst, i)
> +                    break
> +                else:
> +                    logger.warning("pkgs different: {0}
> {1}".format(a_catlst[i],b_catlst[i]))
> +                    sys.exit(1)
> +        except IndexError as e:
> +            logger.info("package %s not in acat", b_catlst[i])
>
>      # import pdb; pdb.set_trace()
> -    logger.debug("catalogs are same")
> +    logger.info("catalogs are same")
>      sys.exit(0)
>
>
>
> This was sent by the SourceForge.net collaborative development platform,
> the world's largest Open Source development site.
>
>

[Attachment #3 (text/html)]

<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">Hi Carsten,  \
</div><div class="gmail_quote"><br></div><div class="gmail_quote">More comments! I \
hope you&#39;ll be able to reduce the size of this script.</div>

<div class="gmail_quote"><br></div><div class="gmail_quote">2014-01-28  <span \
dir="ltr">&lt;<a href="mailto:cgrzemba@users.sourceforge.net" \
target="_blank">cgrzemba@users.sourceforge.net</a>&gt;</span><br><blockquote \
class="gmail_quote" style="margin:0px 0px 0px \
0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">


Revision: 22910<br>
               <a href="http://sourceforge.net/p/gar/code/22910" \
                target="_blank">http://sourceforge.net/p/gar/code/22910</a><br>
Author:    cgrzemba<br>
Date:       2014-01-28 16:20:56 +0000 (Tue, 28 Jan 2014)<br>
Log Message:<br>
-----------<br>
use argparse, add out of order pkg compare<br>
<br>
Modified Paths:<br>
--------------<br>
      csw/mgar/gar/v2/lib/python/compare_catalog.py<br>
<br>
Modified: csw/mgar/gar/v2/lib/python/compare_catalog.py<br>
===================================================================<br>
--- csw/mgar/gar/v2/lib/python/compare_catalog.py          2014-01-28 12:36:44 UTC \
                (rev 22909)<br>
+++ csw/mgar/gar/v2/lib/python/compare_catalog.py          2014-01-28 16:20:56 UTC \
(rev 22910)<br> @@ -2,55 +2,108 @@<br>
<br>
  import cjson<br>
  import logging<br>
-import optparse<br>
+import argparse<br>
  import urllib2<br>
  import sys<br>
+import re<br>
<br>
  logging.basicConfig(format=&#39;%(asctime)s %(levelname)s:%(message)s&#39;)<br>
  logger = logging.getLogger(__name__)<br>
<br>
+remote_scheme = [&#39;http&#39;,&#39;https&#39;]<br>
+local_scheme = [&#39;file&#39;]<br>
+<br>
+def prepareCatListFromURI(uri):<br>
+      catlst = []<br>
+      if &#39;://&#39; in uri:<br></blockquote><div><br></div><div>We can say that \
you have to have a valid URI that either starts with http:// or https:// or \
file://</div><div>  </div><blockquote class="gmail_quote" style="margin:0px 0px 0px \
0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">



+            scheme = uri.split(&#39;:&#39;)[0]<br>
+            if scheme in remote_scheme:<br>
+                  <a href="http://logger.info" \
target="_blank">logger.info</a>(&quot;fetch remote %s&quot;, uri)<br> +               \
data = urllib2.urlopen(uri).read()<br></blockquote><div><br></div><div>Let&#39;s use \
the requests module. We have a package.</div><div><br></div><div><a \
href="http://sourceforge.net/apps/trac/gar/browser/csw/mgar/gar/v2/lib/python/rest.py# \
L250">http://sourceforge.net/apps/trac/gar/browser/csw/mgar/gar/v2/lib/python/rest.py#L250</a><br>


</div><div>  </div><blockquote class="gmail_quote" style="margin:0px 0px 0px \
0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
 +                  catlst = cjson.decode(data)<br>
+                  for e in catlst:<br>
+                        del e[9]<br>
+                  return catlst<br>
+            elif scheme in local_scheme:<br>
+                  uri = re.sub(&#39;.*://&#39;,&#39;&#39;,uri)<br>
+            else:<br>
+                  logger.error(&#39;unsupported URI format&#39;)<br>
+                  sys.exit(4)<br>
+      with open(uri) as lcat:<br>
+            <a href="http://logger.info" target="_blank">logger.info</a>(&quot;fetch \
local %s&quot;, uri)<br> +            for line in lcat: # skip 4 lines header &#39;# \
CREATIONDATE&#39;<br></blockquote><div><br></div><div>We already have a parser, \
please use it.</div><div><a \
href="http://sourceforge.net/apps/trac/gar/browser/csw/mgar/gar/v2/lib/python/catalog. \
py#L66">http://sourceforge.net/apps/trac/gar/browser/csw/mgar/gar/v2/lib/python/catalog.py#L66</a><br>


</div><div>  </div><blockquote class="gmail_quote" style="margin:0px 0px 0px \
0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
 +                  if line.startswith(&quot;# CREATIONDATE&quot;):<br>
+                        break<br>
+            for line in lcat:<br>
+                  if line.startswith(&quot;-----BEGIN PGP SIGNATURE&quot;):<br>
+                        break<br>
+                  catlst.append(line.rstrip().split(&#39; &#39;))<br>
+      return catlst<br>
+<br>
+def compareOutOfOrder(a_catlst, b_catlst, idx):<br>
+      a_pkgName2Idx = {}<br>
+      i = idx<br>
+      for j in range(idx,len(a_catlst)):<br>
+            a_pkgName2Idx[a_catlst[j][0]] = j<br>
+      # import pdb; pdb.set_trace()<br>
+      while i &lt; len(b_catlst):<br>
+            if b_catlst[i][0] in a_pkgName2Idx:<br>
+                  if b_catlst[i] != a_catlst[a_pkgName2Idx[b_catlst[i][0]]]:<br>
+                        logger.warning(&quot;pkgs different at {0},{1}: {2} \
{3}&quot;.format(i,a_pkgName2Idx[b_catlst[i][0]],a_catlst[a_pkgName2Idx[b_catlst[i][0]]],b_catlst[i]))<br>
 +                        sys.exit(1)<br>
+            else:<br>
+                  logger.warning(&quot;not in acat: %s&quot;, b_catlst[i])<br>
+                  sys.exit(1)<br>
+            i += 1<br>
+      b_pkgName2Idx = {}<br>
+      for j in range(idx,len(b_catlst)):<br>
+            b_pkgName2Idx[b_catlst[j][0]] = j<br>
+      # import pdb; pdb.set_trace()<br>
+      i = idx<br>
+      while i &lt; len(a_catlst):<br>
+            if a_catlst[i][0] not in b_pkgName2Idx:<br>
+                  logger.warning(&quot;not in bcat: %s&quot;, a_catlst[i])<br>
+                  sys.exit(1)<br>
+            i += 1<br>
</blockquote><div><br></div><div>Why not convert both to a data structure consisting \
of basic types: nested lists and dicts? Then you can just compare them using the == \
operator. If you wanted some diagnostic output to display the difference, you can \
always serialize them and display the textual diff - it will save you lots of lines \
of code.</div>

<div>  </div><blockquote class="gmail_quote" style="margin:0px 0px 0px \
0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
  def main():<br>
-      parser = optparse.OptionParser()<br>
-      parser.add_option(&quot;-v&quot;,&quot;--verbose&quot;, \
                dest=&quot;verbose&quot;, \
                action=&quot;store_true&quot;,default=False)<br>
-      parser.add_option(&quot;-a&quot;,&quot;--existing-catalog&quot;, \
                dest=&quot;oldcatalog&quot;,<br>
-                              help=&#39;set URI of existing catalog&#39;, metavar = \
                &#39;catalog&#39;)<br>
-      parser.add_option(&quot;-b&quot;,&quot;--new-catalog&quot;, \
                dest=&quot;newcatalog&quot;,<br>
-                              help=&#39;set URI of catalog to generate&#39;, metavar \
                = &#39;catalog&#39;)<br>
-      options, args = parser.parse_args()<br>
+      parser = argparse.ArgumentParser()<br>
+      parser.add_argument(&quot;-v&quot;,&quot;--verbose&quot;, \
dest=&quot;verbose&quot;, action=&quot;store_true&quot;,default=False)<br> +      \
parser.add_argument(&quot;acat&quot;,help=&quot;catalog URI&quot;)<br> +      \
parser.add_argument(&quot;bcat&quot;,help=&quot;catalog URI&quot;)<br> +      args = \
parser.parse_args()<br>  opterror = False<br>
-      if options.verbose:<br>
+      if args.verbose:<br>
              logger.setLevel(logging.INFO)<br>
-      if options.debug:<br>
-            logger.setLevel(logging.DEBUG)<br>
-      if options.newcatalog is None or options.oldcatalog is None:<br>
-            logger.error(&quot;mandatory option missing&quot;)<br>
+      if args.acat is None or args.bcat is None:<br>
+            logger.error(&quot;mandatory args &#39;acat&#39; &#39;bcat&#39; \
missing&quot;)<br>  sys.exit(2)<br>
-      oldcat = options.oldcatalog<br>
-      newcat = options.newcatalog<br>
-      <a href="http://logger.info" target="_blank">logger.info</a>(&quot; compare %s \
with %s&quot;, oldcat, newcat)<br> <br>
-      data = urllib2.urlopen(oldcat).read()<br>
-      a_catlst = cjson.decode(data)<br>
-      for e in a_catlst:<br>
-            del e[9]<br>
-      b_catlst = []<br>
-      with open(newcat) as nc:<br>
-            for i in range(4): # skip 4 lines header<br>
-                  nc.readline()<br>
-            for cl in nc.readlines():<br>
-                  if &quot;-----BEGIN&quot; == cl.split(&#39; &#39;)[0]:<br>
-                        break<br>
-                  b_catlst.append(cl.rstrip().split(&#39; &#39;))<br>
+      <a href="http://logger.info" target="_blank">logger.info</a>(&quot;fetch cat_a \
%s&quot;, args.acat)<br> +      a_catlst = prepareCatListFromURI(args.acat)<br>
+<br>
+      <a href="http://logger.info" target="_blank">logger.info</a>(&quot;fetch cat_b \
%s&quot;, args.bcat)<br> +      b_catlst = prepareCatListFromURI(args.bcat)<br>
+<br>
+      <a href="http://logger.info" target="_blank">logger.info</a>(&quot;compare \
...&quot;)<br>  if len(a_catlst) != len(b_catlst):<br>
-            logger.warning(&quot;a has %d, b has %d \
                packges&quot;,len(a_catlst),len(b_catlst))<br>
-            sys.exit(1)<br>
+            logger.warning(&quot;a has %d, b has %d \
packages&quot;,len(a_catlst),len(b_catlst))<br> +            # sys.exit(1)<br>
        for i in range(len(b_catlst)):<br>
-            if b_catlst[i] != a_catlst[i] :<br>
-                  logger.warning(&quot;a is {0}, b is \
                {1}&quot;.format(a_catlst[i],b_catlst[i]))<br>
-                  sys.exit(1)<br>
+            try:<br>
+                  if b_catlst[i] != a_catlst[i] :<br>
+                        if b_catlst[i][0] != a_catlst[i][0]:<br>
+                              logger.warning(&quot;packages out of order: A: %s; B: \
%s&quot;,a_catlst[i][0], b_catlst[i][0])<br></blockquote><div><br></div><div>Hm, what \
I meant is that out of order comparing:</div><div><br></div>

<div>1. should just work</div><div>2. should not be a special \
case</div><div><br></div><div>The code should use such data structures that the \
ordering doesn&#39;t matter. For example, if you use a dict, then the ordering \
doesn&#39;t matter:</div>

<div><br></div><div><div>&gt;&gt;&gt; a = dict([(&#39;a&#39;, 1), (&#39;b&#39;, \
2)])</div><div>&gt;&gt;&gt; b = dict([(&#39;b&#39;, 2), (&#39;a&#39;, \
1)])</div><div>&gt;&gt;&gt; a == b</div><div>True</div></div><div><br> </div>
<div>  </div><blockquote class="gmail_quote" style="margin:0px 0px 0px \
0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
 +                              compareOutOfOrder(a_catlst, b_catlst, i)<br>
+                              break<br>
+                        else:<br>
+                              logger.warning(&quot;pkgs different: {0} \
{1}&quot;.format(a_catlst[i],b_catlst[i]))<br> +                              \
sys.exit(1)<br> +            except IndexError as e:<br>
+                  <a href="http://logger.info" \
target="_blank">logger.info</a>(&quot;package %s not in acat&quot;, b_catlst[i])<br> \
<br>  # import pdb; pdb.set_trace()<br>
-      logger.debug(&quot;catalogs are same&quot;)<br>
+      <a href="http://logger.info" target="_blank">logger.info</a>(&quot;catalogs \
are same&quot;)<br>  sys.exit(0)<br>
<br>
<br>
<br>
This was sent by the SourceForge.net collaborative development platform, the \
world&#39;s largest Open Source development site.<br> <br>
</blockquote></div><br></div></div>



[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic