Hi,

Apologies if this is against etiquette. I've just got my first Python app
up and running. It is a podcast aggregator that depends on feedparser. I've
really only learnt enough to get this up and running, so any tips on the
code quality and use of Python would be appreciated; I've got a feeling the
overall structure is up the creek. It's approx. 220 LOC.

file: GodCast.py

Cheers,
Lex.

#!/usr/bin/python

# GodCast: podcast aggregator!
# depends on wget & lynx
# * one of the main features of GodCast is its light use of bandwidth:
#   feeds are re-fetched at most maxChecksPerDay times a day, unlike many
#   podcatchers that hit the server on every run.
# http://www.faqts.com/knowledge_base/view.phtml/aid/422/fid/17
# TODO: not-found log
# TODO:
#   config file
#   opml feed list?
#   pygtk/pyqt/qtkde gui?
#   possible flags: test (print but don't actually do anything)

import re, feedparser, os, sys, shutil, time, getopt
import urllib2
import urllib
import md5

boz = ""
HOME = os.path.expanduser("~")

# user configurable
#maxChecksPerDay = 8
#maxChecksPerDay = 12
maxChecksPerDay = 24
myTemp = '/tmp'
#podDir = os.path.join(HOME, 'Audio/Podcasts')
podDir = os.path.join(HOME, 'Podcasts')
# end user configurable

downDir = os.path.join(myTemp, 'Podcasts')
dotDir = os.path.join(HOME, '.aGodCast')
logFile = os.path.join(dotDir, 'log') # list of downloaded urls
cacheDir = os.path.join(dotDir, 'cache')
ignoreNotFound = False # if true, add files not found to log

# list of feeds, ignore lines not beginning ^http
feedList = os.path.join(dotDir, 'feeds.txt')

def exitFunc():
    #f.close()
    #log.close()
    if boz:
        print boz

def makeDirs(*dirs):
    for dir in dirs:
        if not os.path.exists(dir):
            os.makedirs(dir)

# render is used because feeds use a lot of html, not just plain text.
def render(html):
    if html:
        html = re.sub('"', '\\"', html.encode('utf8'))
        #command = 'echo "' + html + '" | w3m -dump -T text/html'
        #command = 'echo "' + html + '" | html2text'
        command = 'echo "' + html + '" | lynx -dump -stdin -force_html'
        os.system(command)

def localMD5(url):
    hash = md5.new(url).hexdigest() + '.xml' # unique name from url
    return os.path.join(cacheDir, hash)

def cache(url):
    # e.g. with maxChecksPerDay = 24, a feed is re-fetched at most once
    # every 60*60*24/24 = 3600 seconds.
    max = 60 * 60 * 24 / maxChecksPerDay # seconds
    myfile = localMD5(url)
    if os.path.isfile(myfile):
        elapsed = int(time.time()) - os.path.getmtime(myfile)
        if elapsed <= max:
            return
    print "FETCHING:", url + ' ...'
    urllib.urlretrieve(url, myfile) # handle half finish?

def updateCache(feeds):
    l = []
    print "updating local xml cache..."
    for feed in file(feeds, "r").read().split('\n'):
        if not re.match('^http://', feed): # feedList ignores anything but urls
            continue # TODO: handle whitespace, strip trailing
        cache(feed)
        l.append([localMD5(feed), feed])
    print "cache up to date"
    return l

def geturl(url):
    try:
        redir = urllib2.urlopen(url).geturl()
    except urllib2.HTTPError, e:
        if e.code != 404:
            print url
            print "geturl HTTPError:", e.code
        return e.code
    except urllib2.URLError, e:
        # (110, 'Connection timed out')
        print e.reason
        #print "geturl URLError:", e.code
    else:
        return redir
    return 0

def htmlTitle(mainTitle, subTitle):
    s = '<hr>'
    s += '<h1>' + mainTitle + '</h1>'
    s += '<h2>' + subTitle + '</h2>'
    return s
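# Aside for reviewers: I know the shell quoting in render() above is
# fragile -- a backtick or $ in a feed could still confuse the shell even
# with the double quotes escaped. A rough, untested sketch of the same
# lynx pipeline via subprocess (Python 2.4+), which avoids the shell:
#
#   import subprocess
#   def render(html):
#       if html:
#           p = subprocess.Popen(['lynx', '-dump', '-stdin', '-force_html'],
#                                stdin=subprocess.PIPE)
#           p.communicate(html.encode('utf8'))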
def downloadPod(url, dest):
    kb = 2
    success = 0
    command = 'wget --continue -O "' + dest + '" "' + url + '"'
    status = os.system(command)
    if status == success:
        return True
    else:
        print "\nWGET:", status
        if status == kb:
            pass #raise KeyboardInterrupt
        return False

def downloadQueue(q, latest):
    for x in range(latest):
        for [feedTitle, castList] in q:
            if not len(castList) > x:
                continue
            cast = castList[x]
            if cast is None:
                continue
            url = cast.enclosures[0]['href']
            redirect = geturl(url) # TRAFFIC
            if type(redirect) != int: # success
                render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title))
                render(cast.description)
                podFile = os.path.basename(redirect).split('?')[0]
                permDir = os.path.join(podDir, feedTitle)
                permFile = os.path.join(permDir, podFile)
                tempDir = os.path.join(downDir, feedTitle)
                tempFile = os.path.join(tempDir, podFile)
                if not os.path.isfile(permFile):
                    makeDirs(tempDir, permDir)
                    if downloadPod(redirect, tempFile): # TRAFFIC
                        shutil.move(tempFile, permFile)
                        log(url)
                    else:
                        print "EXITING"
                        sys.exit(2)
                else:
                    render("<br>*** ON HARD-DRIVE ***")
                    log(url)
            elif redirect == 404:
                print 'NOT FOUND:', url
                if ignoreNotFound:
                    print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
                    log(url)
                else:
                    sys.exit(2)

def log(url):
    file(logFile, 'a').write(url + "\n")

def main(args):
    global ignoreNotFound, boz
    sys.exitfunc = exitFunc
    makeDirs(dotDir, podDir, downDir, cacheDir)
    # make the log file if it doesn't exist; may be a better solution?
    file(logFile, 'a').close()
    latest = 13 # get the first x casts for each feed
    try:
        opts, args = getopt.getopt(sys.argv[1:], "l:", ["latest=", "notfound"])
    except getopt.GetoptError:
        sys.exit(2) #usage()
    for opt, arg in opts:
        if opt in ("-l", "--latest"):
            latest = int(arg)
        elif opt == "--notfound":
            ignoreNotFound = True # add notfound files to log

    Q = []
    for [xmlFile, url] in updateCache(feedList):
        output = ""
        xml = feedparser.parse(xmlFile)
        if xml.channel.has_key('title'): # skip dodgy feeds
            itemQ = []
            for item in xml['items'][:latest]:
                if item.has_key('enclosures'):
                    podURL = item.enclosures[0]['href']
                    # check if url is already in the log
                    if file(logFile, 'r').read().find(podURL) < 0:
                        itemQ.append(item)
                        output += htmlTitle(xml.channel.title, item.title)
                        output += item.description
                else:
                    itemQ.append(None)
            Q.append([xml.channel.title, itemQ])
        else:
            print "DODGY FEED:", url
            if xml.bozo:
                boz += "BOZO: " + xml.bozo_exception.getMessage() + "\t" + url
            sys.exit(2)
            #time.sleep(1) # allow ctrl+c
            #continue
        render(output)

    if Q:
        render('<hr><h1>DOWNLOADING QUEUE</h1><hr>')
        downloadQueue(Q, latest)
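# Aside for reviewers: main() above re-reads the whole log file for every
# enclosure just to do a substring search. An untested sketch of reading it
# once up front into a set instead (exact match rather than substring):
#
#   downloaded = set(file(logFile).read().split('\n'))
#   ...
#   if podURL not in downloaded:
#       itemQ.append(item)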
######################################################
if __name__ == "__main__":
    main(sys.argv)
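P.S. The only flags so far are -l/--latest (how many recent casts to grab
per feed) and --notfound, so a typical run looks something like:

    python GodCast.py --latest 5
    python GodCast.py --notfound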