#!/usr/bin/python
# -*- coding: latin-1 -*-
# py_kents_tutor.py
''' Counts all posts to a pipermail mailing-list archive by author
(written for Python-tutor; currently pointed at asterisk-users)'''
from datetime import date, timedelta
import operator, urllib2
from BeautifulSoup import BeautifulSoup

today = date.today()

for year in [2008]:
    startDate = date(year, 1, 1)
    endDate = date(year, 12, 31)
    thirtyOne = timedelta(days=31)
    counts = {}

    # Collect all the counts for a year by scraping the monthly author archive pages
    while startDate < endDate and startDate < today:
        dateString = startDate.strftime('%Y-%B')

#        url = 'http://mail.python.org/pipermail/tutor/%s/author.html' % dateString
        url = 'http://lists.digium.com/pipermail/asterisk-users/%s/author.html' % dateString
        data = urllib2.urlopen(url).read()
        soup = BeautifulSoup(data)

        li = soup.findAll('li')[2:-2]

        for l in li:
            name = l.i.string.strip()
            counts[name] = counts.get(name, 0) + 1

        startDate += thirtyOne

    # Consolidate names that vary by case under the most popular spelling
    nameMap = dict() # Map lower-case name to most popular name
    for name, count in sorted(counts.iteritems(), 
            key=operator.itemgetter(1), reverse=True):
       lower = name.lower()
       if lower in nameMap:
          # Add counts for a name we have seen already
          counts[nameMap[lower]] += count
       else:
          nameMap[lower] = name

    print
    print year
    print '===='
    for name, count in sorted(counts.iteritems(), 
            key=operator.itemgetter(1), reverse=True)[:20]:
        print name.encode('latin-1', 'xmlcharrefreplace'), count
    print