Python regex on list

Posted by Peter Nielsen on Stack Overflow See other posts from Stack Overflow or by Peter Nielsen
Published on 2010-05-22T10:48:29Z Indexed on 2010/05/22 11:00 UTC
Read the original article Hit count: 265

Filed under:
|
|
|

Hi there

I am trying to build a parser and save the results as an XML file, but I have problems. For instance, I get a TypeError: expected string or buffer when I try to run the code.

Would you experts please have a look at my code?

# Parse OSCTEST.html, pull out the headline and article fragments with
# regular expressions, and print them as a small XML document:
#
#   <root><countries><title>..</title>..<artikel>..</artikel>..</countries></root>

import re
import urllib2
from xml.dom.minidom import Document

from BeautifulSoup import BeautifulSoup as bs

# Compiled once up front; pattern text and flags are unchanged.
TITLE_RE = re.compile('<h1 class="title metadata_title content_perceived_text(.*?)</h1>', re.DOTALL | re.IGNORECASE)
ARTICLE_RE = re.compile('<span class="content_text">(.*?)</span>', re.DOTALL | re.IGNORECASE)

# Read the whole page, closing the file handle as soon as the content is
# in memory.
with open('OSCTEST.html', 'r') as osc:
    oscread = osc.read()

soup = bs(oscread)

# findall() only accepts a string. Passing the soup object itself is what
# raised "TypeError: expected string or buffer", so serialise the parsed
# document back to markup before searching it.
html = str(soup)

# Build the XML skeleton: <root><countries/></root>.
doc = Document()
root = doc.createElement('root')
doc.appendChild(root)
countries = doc.createElement('countries')
root.appendChild(countries)

findtags1 = TITLE_RE.findall(html)
findtags2 = ARTICLE_RE.findall(html)

# One <title> element per matched headline.
for header in findtags1:
    title_elem = doc.createElement('title')
    countries.appendChild(title_elem)
    title_elem.appendChild(doc.createTextNode(header))

# One <artikel> element per matched content span, with the paragraph tags
# stripped from the captured text.
for item in findtags2:
    art_elem = doc.createElement('artikel')
    countries.appendChild(art_elem)
    text = item.replace('<P>', '').replace('</P>', '')
    art_elem.appendChild(doc.createTextNode(text))

# Parenthesised single-argument form prints the same text whether print is
# a statement or a function.
print(doc.toprettyxml())

© Stack Overflow or respective owner

Related posts about python

Related posts about Xml