#!/usr/bin/env python
# A hard-coded, naive HTML to Atom converter for sil2100.vexillium.org
# What could/should be done is rewriting this code to make it more generic,
# allowing the user to input what webpage should be served etc. But for now,
# everything is hard-coded.
# Atom entries are defined by <div class="entry"> elements.
# Copyright (C) 2011 Lukasz 'sil2100' Zemczak
import sys
from HTMLParser import HTMLParser
class AtomFeed(HTMLParser):
    """Scrape entries from an HTML page and print them as a feed.

    The parser looks for ``<div class="entry">`` elements.  Inside each one
    it collects the text of the ``<h4>`` element (title), the ``name``
    attribute of an ``<a>`` element (entry id) and the text of nested
    ``<div>`` elements whose class is ``date``, ``time`` or ``intro``.
    Call ``feed()`` with the page source, then ``generate_feed()`` to write
    the result to standard output.

    NOTE(review): several output literals below are empty or contain only
    whitespace; it looks like the XML/Atom tags were stripped from this copy
    of the file -- confirm against the upstream script.  The literals are
    kept exactly as they are here.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # State flags telling handle_data() which field is being read.
        self.proc_entry = False
        self.proc_title = False
        self.proc_date = False
        self.proc_time = False
        self.proc_intro = False
        # Current <div> nesting depth, and the depth at which the entry
        # currently being read was opened (-1 when outside an entry).
        self.div_count = 0
        self.proc_div = -1
        # Text collected for the entry currently being read.
        self.atom_title = ''
        self.atom_date = ''
        self.atom_time = ''
        self.atom_intro = ''
        self.atom_id = ''
        # Rendered entries, in document order.
        self.entries = ''
        # Date and time of the first entry seen; used for the feed header.
        self.latest_date = None
        self.latest_time = None

    def _reset_entry_state(self):
        """Forget everything collected for the entry that just ended."""
        self.proc_div = -1
        # Clear the field flags as well, so an unclosed <h4> or <div> in one
        # entry cannot redirect the text of the next entry.
        self.proc_title = False
        self.proc_date = False
        self.proc_time = False
        self.proc_intro = False
        self.atom_title = ''
        self.atom_date = ''
        self.atom_time = ''
        self.atom_intro = ''
        # Reset the id too: otherwise an entry without an <a name=...>
        # anchor would silently reuse the previous entry's id.
        self.atom_id = ''

    def _emit(self, lines):
        """Write each string in *lines* to stdout followed by a newline."""
        for line in lines:
            sys.stdout.write(line + '\n')

    def handle_starttag(self, tag, attrs):
        """Track div nesting and switch on the flag for the field entered."""
        attrs = dict(attrs)
        # .get() because plenty of tags carry no class attribute at all;
        # indexing attrs['class'] raised KeyError on those.
        css_class = attrs.get('class')

        if tag == 'div':
            # Count the divs we enter, so we know when the 'entry' div ends
            self.div_count += 1

        if not self.proc_entry:
            if tag == 'div' and css_class == 'entry':
                self.proc_entry = True
                self.proc_div = self.div_count
            return

        if tag == 'h4':
            self.proc_title = True
        elif tag == 'a' and 'name' in attrs:
            self.atom_id = str(attrs['name'])
        elif tag == 'div':
            if css_class == 'date':
                self.proc_date = True
            elif css_class == 'time':
                self.proc_time = True
            elif css_class == 'intro':
                self.proc_intro = True

    def handle_data(self, data):
        """Append *data* to whichever entry field is currently being read."""
        if not self.proc_entry:
            return
        if self.proc_title:
            self.atom_title += str(data)
        elif self.proc_date:
            self.atom_date += str(data)
        elif self.proc_time:
            self.atom_time += str(data)
        elif self.proc_intro:
            self.atom_intro += str(data)

    def handle_endtag(self, tag):
        """Close the current field, or the whole entry when its div ends."""
        if (tag == 'div' and self.proc_entry
                and self.div_count == self.proc_div):
            # The div that opened the entry is closing: store the entry.
            self.proc_entry = False
            self.add_entry()
            self._reset_entry_state()
        elif self.proc_entry:
            if tag == 'h4':
                self.proc_title = False
            elif tag == 'div':
                # Only one of the field flags is cleared per closing div.
                if self.proc_date:
                    self.proc_date = False
                elif self.proc_time:
                    self.proc_time = False
                elif self.proc_intro:
                    self.proc_intro = False

        if tag == 'div':
            self.div_count -= 1

    def start_feed(self):
        """Print the hard-coded feed header, modify if needed.

        The header timestamp comes from the first entry parsed.  When no
        entry was parsed at all, the current UTC time is used instead of
        failing half-way through the header with a TypeError.
        """
        stamp_date = self.latest_date
        stamp_time = self.latest_time
        if stamp_date is None:
            import time
            now = time.gmtime()
            # assumes entry dates are YYYY-MM-DD and times HH:MM, which is
            # what the 'T' / ':00Z' framing below implies -- TODO confirm
            stamp_date = time.strftime('%Y-%m-%d', now)
            stamp_time = time.strftime('%H:%M', now)

        self._emit([
            '',
            '',
            '',
            '',
            ' sil2100//vx web-page - Development',
            ' http://sil2100.vexillium.org:41/',
            ' ',
            ' ',
            ' ' + stamp_date + 'T' + stamp_time + ':00Z',
            ' ',
            ' Lukasz Zemczak',
            ' sil2100@vexillium.org',
            ' ',
            '',
        ])

    def add_entry(self):
        """Render the entry collected so far and append it to self.entries."""
        if not self.atom_time:
            # Entries without a time div get a fixed placeholder time.
            self.atom_time = '00:01'
        if self.latest_date is None:
            # The first entry seen provides the feed-level timestamp.
            self.latest_date = self.atom_date
            self.latest_time = self.atom_time

        self.entries += ''.join([
            ' \n',
            ' tag:sil2100.vexillium.org,' + self.atom_date
            + ':/?id=dev/' + self.atom_id + '\n',
            ' ' + self.atom_title + '\n',
            ' \n',
            ' ' + self.atom_date + 'T' + self.atom_time + ':00Z\n',
            ' ' + self.atom_intro + '\n',
            ' \n',
            '\n',
        ])

    def end_feed(self):
        """Print the feed footer."""
        self._emit([''])

    def generate_feed(self):
        """Print the header, all collected entries and the footer."""
        self.start_feed()
        self._emit([self.entries])
        self.end_feed()
def main():
    """Convert the HTML page named on the command line and print the feed.

    Expects the path of the page as the first command-line argument.  When
    it is missing, a usage line is printed and nothing else happens.
    """
    if len(sys.argv) < 2:
        sys.stdout.write('Usage: atom_script.py page.html\n')
        return

    # Read inside a with-block so the file handle is closed even if the
    # read fails; the original never closed it.
    with open(sys.argv[1], 'r') as page:
        html = page.read()

    feed = AtomFeed()
    feed.feed(html)
    feed.generate_feed()


if __name__ == '__main__':
    main()