#!/usr/bin/env python
# A hard-coded, naive HTML to Atom converter for sil2100.vexillium.org
# What could/should be done is rewriting this code to make it more generic,
# allowing the user to input what webpage should be served etc. But for now,
# everything is hard-coded.
# Atom entries are taken from <div class="entry"> elements of the page.
# Copyright (C) 2011 Lukasz 'sil2100' Zemczak

import sys
import time

try:
    from HTMLParser import HTMLParser  # Python 2 module name
except ImportError:
    from html.parser import HTMLParser  # Python 3 module name


class AtomFeed(HTMLParser):
    """Collect feed entries from an HTML page and print them as a feed.

    Feed the page text through ``feed()``; every ``<div class="entry">``
    becomes one entry.  Inside an entry the parser picks up:

    * the text of ``<h4>`` as the entry title,
    * the ``name`` attribute of ``<a name=...>`` as the entry id,
    * the text of nested ``<div>`` elements with class ``date``, ``time``
      and ``intro``.

    The date and time of the first entry seen are remembered as the
    feed-level timestamp.  ``generate_feed()`` prints the result to stdout.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # We use all these flags for our convenience: each one tells
        # handle_data() which field the current text belongs to.
        self.proc_entry = False
        self.proc_title = False
        self.proc_date = False
        self.proc_time = False
        self.proc_intro = False
        # Current <div> nesting depth and the depth at which the entry
        # being processed was opened (-1 when outside of any entry).
        self.div_count = 0
        self.proc_div = -1
        # Fields of the entry currently being collected.
        self.atom_title = ''
        self.atom_date = ''
        self.atom_time = ''
        self.atom_intro = ''
        self.atom_id = ''
        # Text of all finished entries, and the timestamp of the first one.
        self.entries = ''
        self.latest_date = None
        self.latest_time = None

    def handle_starttag(self, tag, attrs):
        """Track <div> depth and switch on the reading state for a field."""
        attrs = dict(attrs)
        # Not every <div> carries a class attribute; .get() avoids the
        # KeyError that indexing attrs['class'] raised on such elements.
        css_class = attrs.get('class')

        if tag == 'div':
            # Count the number of div's we enter, so we know when the
            # 'entry' div ends
            self.div_count += 1

        if not self.proc_entry:
            if tag == 'div' and css_class == 'entry':
                self.proc_entry = True
                self.proc_div = self.div_count
            return

        if tag == 'h4':
            self.proc_title = True
        elif tag == 'a' and 'name' in attrs:
            self.atom_id = str(attrs['name'])
        elif tag == 'div':
            if css_class == 'date':
                self.proc_date = True
            elif css_class == 'time':
                self.proc_time = True
            elif css_class == 'intro':
                self.proc_intro = True

    def handle_data(self, data):
        """Append text to whichever entry field is currently being read."""
        # Copy the data, if we're in a reading state
        if not self.proc_entry:
            return
        if self.proc_title:
            self.atom_title += str(data)
        elif self.proc_date:
            self.atom_date += str(data)
        elif self.proc_time:
            self.atom_time += str(data)
        elif self.proc_intro:
            self.atom_intro += str(data)

    def handle_endtag(self, tag):
        """Close the current field or, at the entry's own </div>, the entry."""
        if self.proc_entry:
            if tag == 'div' and self.div_count == self.proc_div:
                # This </div> matches the depth of the entry's opening tag:
                # the entry is complete, store it and reset the fields.
                self.proc_entry = False
                self.add_entry()
                self.proc_div = -1
                self.atom_title = ''
                self.atom_date = ''
                self.atom_time = ''
                self.atom_intro = ''
            elif tag == 'h4':
                self.proc_title = False
            elif tag == 'div':
                # A nested </div> ends at most one field, checked in the
                # order date, time, intro.
                if self.proc_date:
                    self.proc_date = False
                elif self.proc_time:
                    self.proc_time = False
                elif self.proc_intro:
                    self.proc_intro = False

        if tag == 'div':
            self.div_count -= 1

    def start_feed(self):
        """Print the hard-coded feed header, including the feed timestamp."""
        if self.latest_date is None:
            # No entry was parsed, so there is no timestamp to reuse.  Fall
            # back to the current UTC time instead of failing on
            # None + str.  The 'YYYY-MM-DD' / 'HH:MM' shapes match how the
            # values are joined below and the '00:01' default time;
            # NOTE(review): confirm the page uses the same date format.
            now = time.gmtime()
            updated_date = time.strftime('%Y-%m-%d', now)
            updated_time = time.strftime('%H:%M', now)
        else:
            updated_date = self.latest_date
            updated_time = self.latest_time

        # A hard-coded header, modify if needed
        # NOTE(review): several literals here are empty or whitespace-only;
        # they look like places where markup was lost -- confirm against the
        # deployed script before relying on this output.
        header_lines = [
            '',
            '',
            '',
            '',
            ' sil2100//vx web-page - Development',
            ' http://sil2100.vexillium.org:41/',
            ' ',
            ' ',
            ' ' + updated_date + 'T' + updated_time + ':00Z',
            ' ',
            ' Lukasz Zemczak',
            ' sil2100@vexillium.org',
            ' ',
            '',
        ]
        for line in header_lines:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(line)

    def add_entry(self):
        """Append the entry collected so far to ``self.entries``."""
        if not self.atom_time:
            # Entries without a time still need one for the timestamp.
            self.atom_time = '00:01'

        if self.latest_date is None:
            # The first entry on the page supplies the feed timestamp.
            self.latest_date = self.atom_date
            self.latest_time = self.atom_time

        # NOTE(review): as in start_feed(), the whitespace-only pieces look
        # like placeholders for lost markup -- confirm before relying on it.
        self.entries += ' \n'
        self.entries += (' tag:sil2100.vexillium.org,' + self.atom_date +
                         ':/?id=dev/' + self.atom_id + '\n')
        self.entries += ' ' + self.atom_title + '\n'
        self.entries += ' \n'
        self.entries += (' ' + self.atom_date + 'T' + self.atom_time +
                         ':00Z\n')
        self.entries += ' ' + self.atom_intro + '\n'
        self.entries += ' \n'
        self.entries += '\n'

    def end_feed(self):
        """Print the hard-coded feed footer."""
        print('')

    def generate_feed(self):
        """Print the complete feed: header, collected entries, footer."""
        # We print out the Atom feed generated
        self.start_feed()
        print(self.entries)
        self.end_feed()


def main():
    """Convert the HTML file named on the command line and print the feed."""
    if len(sys.argv) < 2:
        print('Usage: atom_script.py page.html')
        return

    # The context manager closes the file even if reading fails.
    with open(sys.argv[1], 'r') as page:
        html = page.read()

    feed = AtomFeed()
    feed.feed(html)
    feed.generate_feed()


if __name__ == '__main__':
    main()