"""Convert NYTimes feeds to HTML. Code by Aaron Swartz. GNU GPL 2. Portions from http://diveintomark.org/public/nyt.txt by Mark Pilgrim.""" import urllib, cPickle as pickle, time, sys, re from xml.dom import minidom def text(node, name): rc = "" if not node.getElementsByTagName(name): return "" for child in node.getElementsByTagName(name)[0].childNodes: if not hasattr(child, 'data'): child.data = "" rc = rc + child.data return rc feeds = ['Africa', 'Americas', 'ArtandDesign', 'Arts', 'AsiaPacific', 'Automobiles', 'Baseball', 'Books', 'Business', 'Circuits', 'CollegeBasketball', 'CollegeFootball', 'DiningandWine', 'Education', 'Environment', 'Escapes', 'Europe', 'FashionandStyle', 'Golf', 'Health', 'HealthCarePolicy', 'Hockey', 'HomePage', 'HomeandGarden', 'International', 'JobMarket', 'Magazine', 'MediaandAdvertising', 'MetroCampaigns', 'MiddleEast', 'MovieNews', 'Movies', 'Multimedia', 'Music', 'NYRegion', 'National', 'Nutrition', 'Obituaries', 'Opinion', 'OtherSports', 'ProBasketball', 'ProFootball', 'Psychology', 'RealEstate', 'RedCarpet', 'Science', 'Soccer', 'Space', 'Sports', 'SundayBookReview', 'SundayStyles', 'Technology', 'Television', 'TheCity', 'Theater', 'ThursdayStyles', 'TimesSelect', 'Travel', 'Washington', 'Weddings', 'WeekinReview', 'WorldBusiness', 'YourMoney', 'pop_top'] # removed: 'Campaigns', 'Trail', PoguesPosts items = [] for feed in feeds: c = urllib.urlopen("http://www.nytimes.com/services/xml/rss/userland/%s.xml" % feed) c = minidom.parseString(c.read().replace('\x92', "'")).getElementsByTagName("channel")[0] items += [[text(node, "link"), text(node, "title"), text(node, "author").title().replace('By', 'by'), feed, text(node, "description"), time.time()] for node in c.getElementsByTagName("item")] fitems = pickle.load(open("db.pkl")) itemurls = [x[0] for x in fitems] newitemurls = [] for item in items: if item[0] not in (itemurls + newitemurls): newitemurls.append(item[0]) fitems = [item] + fitems def sorter(x): d = re.findall('http://www.nytimes.com/(\d\d\d\d/\d\d/\d\d)/', x[0]) if d: return d[0] + `x[5]` return time.strftime("%Y/%m/%d", time.localtime(x[5])) + `x[5]` fitems.sort(lambda y,x: cmp(sorter(x),sorter(y))) items = fitems[:1000] newnewitemurls = [] oldnewitemurls = [] itemurls = [x[0] for x in items] for url in newitemurls: if url in itemurls: newnewitemurls.append(url) else: oldnewitemurls.append(url) pickle.dump(items, open("nytimes.pkl", "w")) import dbm; from nytutils import cleanLink db = dbm.open('itemurls.dbm', 'w') for item in itemurls: if not db.has_key(cleanLink(item)): db[cleanLink(item)] = item open('urlarchive.txt', 'a').write('\n'.join(newnewitemurls)+'\n') if oldnewitemurls: open('olditemurls', 'a').write('\n'.join(oldnewitemurls)+'\n') print 'New York Times: Latest Headlines

New York Times: Latest Headlines

Webloggers: Check out our new link generator.

' for item in items: #item[5] = time.strftime("%b %e %l:%M:%S %p", time.strptime(item[5], "%Y%m%d%H%M%S")) item[5] = time.strftime("%l%p %e %b", time.localtime(item[5])) print ('

<a href="%s">%s</a> %s <i>(%s)</i><br>
%s <small>%s</small>

""" % tuple(item)).encode('utf8')

print """
<p>Site by Aaron Swartz (source code); content by the New York Times.</p>
</body></html>"""
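
# nytutils (which provides cleanLink) is a companion module not included in
# this file.  Judging from its use above, cleanLink presumably reduces an
# article URL to a canonical form so the same story always maps to a single
# dbm key.  A minimal sketch of such a normalizer -- an assumption, not the
# actual nytutils code -- might be:
#
#     def cleanLink(url):
#         # drop the query string (e.g. ?pagewanted=all) and any trailing /
#         return url.split('?')[0].rstrip('/')
#
# Downstream consumers can read the saved headlines back the same way this
# script does; each record is a [link, title, author, feed, description,
# fetch-time] list:
#
#     import cPickle as pickle
#     for link, title, author, feed, desc, when in pickle.load(open("nytimes.pkl")):
#         print title, "--", link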