"""Convert NYTimes feeds to HTML.
Code by Aaron Swartz. GNU GPL 2.
Portions from http://diveintomark.org/public/nyt.txt by Mark Pilgrim."""

import urllib, cPickle as pickle, time, sys, re
from xml.dom import minidom

def text(node, name):
    """Return the concatenated character data of the first <name> child
    element of `node`, or "" when no such element exists.

    Non-text children (nested elements) contribute nothing.  Uses
    getattr instead of assigning child.data = "" to missing-data nodes,
    so the DOM is no longer mutated as a side effect of reading it.
    """
    elements = node.getElementsByTagName(name)
    if not elements:
        return ""
    # join is linear; repeated string += was quadratic (and the old body
    # mixed a tab with spaces, which breaks under python -tt).
    return "".join(getattr(child, 'data', "") for child in elements[0].childNodes)

feeds = ['Africa', 'Americas', 'ArtandDesign', 'Arts', 'AsiaPacific', 'Automobiles', 'Baseball', 'Books', 'Business', 'Circuits', 'CollegeBasketball', 'CollegeFootball', 'DiningandWine', 'Education', 'Environment', 'Escapes', 'Europe', 'FashionandStyle', 'Golf', 'Health', 'HealthCarePolicy', 'Hockey', 'HomePage', 'HomeandGarden', 'International', 'JobMarket', 'Magazine', 'MediaandAdvertising', 'MetroCampaigns', 'MiddleEast', 'MovieNews', 'Movies', 'Multimedia', 'Music', 'NYRegion', 'National', 'Nutrition', 'Obituaries', 'Opinion', 'OtherSports', 'ProBasketball', 'ProFootball', 'Psychology', 'RealEstate', 'RedCarpet', 'Science', 'Soccer', 'Space', 'Sports', 'SundayBookReview', 'SundayStyles', 'Technology', 'Television', 'TheCity', 'Theater', 'ThursdayStyles', 'TimesSelect', 'Travel', 'Washington', 'Weddings', 'WeekinReview', 'WorldBusiness', 'YourMoney', 'pop_top']
# removed: 'Campaigns', 'Trail', PoguesPosts

# Fetch every feed and flatten its <item> entries into `items`.
# Each row is [link, title, author, feed-name, description, fetch-time].
items = []
for feed in feeds:
    url = "http://www.nytimes.com/services/xml/rss/userland/%s.xml" % feed
    raw = urllib.urlopen(url).read()
    # \x92 is a cp1252 right single quote that sometimes leaks into the XML.
    channel = minidom.parseString(raw.replace('\x92', "'")).getElementsByTagName("channel")[0]
    for node in channel.getElementsByTagName("item"):
        items.append([text(node, "link"), text(node, "title"),
                      text(node, "author").title().replace('By', 'by'), feed,
                      text(node, "description"), time.time()])

# Merge freshly fetched items into the persistent database (db.pkl),
# newest first.  A URL is "new" if it is not already stored.
dbfile = open("db.pkl")
fitems = pickle.load(dbfile)
dbfile.close()  # don't rely on refcounting to release the handle
itemurls = [x[0] for x in fitems]
newitemurls = []

# The old test `item[0] not in (itemurls + newitemurls)` rebuilt and
# scanned a list on every iteration (O(n^2)); a set makes it O(1).
seen = set(itemurls)
for item in items:
    if item[0] not in seen:
        seen.add(item[0])
        newitemurls.append(item[0])
        fitems = [item] + fitems

def sorter(x):
    """Sort key for an item row [link, title, author, feed, desc, ts].

    Uses the YYYY/MM/DD date embedded in NYTimes article URLs when
    present, otherwise the calendar date of the fetch timestamp; the raw
    timestamp repr is appended as a tie-breaker within a day.
    """
    # Raw string with escaped dots: the old pattern's bare '.' matched
    # any character, so e.g. "www-nytimes-com" would also have matched.
    d = re.findall(r'http://www\.nytimes\.com/(\d\d\d\d/\d\d/\d\d)/', x[0])
    if d:
        return d[0] + repr(x[5])  # repr() replaces the deprecated backticks
    return time.strftime("%Y/%m/%d", time.localtime(x[5])) + repr(x[5])

# Newest first.  key= computes sorter() once per item instead of once per
# comparison as the old cmp-lambda did; reverse=True gives the same
# descending order (both forms are stable for equal keys).
fitems.sort(key=sorter, reverse=True)
items = fitems[:1000]  # publish only the 1000 most recent stories

# Partition the just-discovered URLs: those that survived the 1000-item
# cut vs. those that were immediately truncated off the end.
newnewitemurls = []
oldnewitemurls = []
itemurls = [x[0] for x in items]
kept = set(itemurls)  # O(1) membership; the old list scan was O(n) per URL
for url in newitemurls:
    if url in kept:
        newnewitemurls.append(url)
    else:
        oldnewitemurls.append(url)

# Persist the trimmed item list for downstream consumers.
pklfile = open("nytimes.pkl", "w")
pickle.dump(items, pklfile)
pklfile.close()

# Record every kept URL, keyed by its normalized form, in the dbm table.
import dbm
from nytutils import cleanLink
db = dbm.open('itemurls.dbm', 'w')
for item in itemurls:
    key = cleanLink(item)
    if not db.has_key(key):  # never overwrite an existing mapping
        db[key] = item
db.close()  # fixed: dbm must be closed explicitly or data may never be flushed

# Append the new URLs to the flat-file archives.
archive = open('urlarchive.txt', 'a')
archive.write('\n'.join(newnewitemurls) + '\n')
archive.close()
if oldnewitemurls:
    old = open('olditemurls', 'a')
    old.write('\n'.join(oldnewitemurls) + '\n')
    old.close()

# Emit the finished HTML page to stdout (the script's output is the
# published page, assembled around SSI includes).
print '<html><head><title>New York Times: Latest Headlines</title><link rel="stylesheet" href="style.css" /></head><body><h1>New York Times: Latest Headlines</h1><p class="description">Webloggers: Check out our new <a href="genlink">link generator</a>.</p>'
for item in items:
    #item[5] = time.strftime("%b %e %l:%M:%S %p", time.strptime(item[5], "%Y%m%d%H%M%S"))
    # Replace the numeric timestamp in-place with display text.  NOTE:
    # %l (space-padded hour) and %e (space-padded day) are glibc
    # strftime extensions -- not portable to every platform.
    item[5] = time.strftime("%l%p %e %b", time.localtime(item[5]))
    # item is [link, title, author, feed, description, formatted-time];
    # encode since titles/descriptions may be unicode strings.
    print ('<p><a href="%s">%s</a> %s (%s)<br />%s %s</p>' % tuple(item)).encode('utf8')
print '<address>Site by <a href="http://www.aaronsw.com/">Aaron Swartz</a> (<a href="source.py">source code</a>); content by <a href="http://www.nytimes.com/">the New York Times</a>.</address><!--#include virtual="nytimes-ads.inc"--></body></html>'
