feedparser patch for UnicodeDecodeError

September 17, 2007 – 18:18

Recently, some feeds fetched by chytach (and parsed by feedparser.py) started failing with a traceback:

Failed parsing http://del.icio.us/tag/gtd with exceptions.UnicodeDecodeError: {'object': '/tag/\xe6\x97\xb6\xe9\x97\xb4\xe7\xae\xa1\xe7\x90\x86', 'end': 6, 'encoding': 'ascii', 'args': ('ascii', '/tag/\xe6\x97\xb6\xe9\x97\xb4\xe7\xae\xa1\xe7\x90\x86', 5, 6, 'ordinal not in range(128)'), 'start': 5, 'reason': 'ordinal not in range(128)'}
Traceback (most recent call last):
  File "/home/akhavr/chytach/bin/feedupdate.py", line 97, in processfeed
    etag=feed.etag, modified=modified)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 2626, in parse
    feedparser.feed(data)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1441, in feed
    sgmllib.SGMLParser.feed(self, data)
  File "/usr/lib/python2.4/sgmllib.py", line 95, in feed
    self.goahead(0)
  File "/usr/lib/python2.4/sgmllib.py", line 134, in goahead
    k = self.parseendtag(i)
  File "/usr/lib/python2.4/sgmllib.py", line 296, in parseendtag
    self.finishendtag(tag)
  File "/usr/lib/python2.4/sgmllib.py", line 336, in finishendtag
    self.unknownendtag(tag)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 476, in unknownendtag
    method()
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1318, in endcontent
    value = self.popContent('content')
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 700, in popContent
    value = self.pop(tag)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 641, in pop
    output = resolveRelativeURIs(output, self.baseuri, self.encoding)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1594, in _resolveRelativeURIs
    p.feed(htmlSource)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1441, in feed
    sgmllib.SGMLParser.feed(self, data)
  File "/usr/lib/python2.4/sgmllib.py", line 95, in feed
    self.goahead(0)
  File "/usr/lib/python2.4/sgmllib.py", line 129, in goahead
    k = self.parsestarttag(i)
  File "/usr/lib/python2.4/sgmllib.py", line 283, in parsestarttag
    self.finishstarttag(tag, attrs)
  File "/usr/lib/python2.4/sgmllib.py", line 314, in finishstarttag
    self.unknownstarttag(tag, attrs)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1588, in unknownstarttag
    attrs = [(key, ((tag, key) in self.relativeuris) and self.resolveURI(value) or value) for key, value in attrs]
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1584, in resolveURI
    return urljoin(self.baseuri, uri)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 286, in _urljoin
    return urlparse.urljoin(base, uri)
  File "/usr/lib/python2.4/urlparse.py", line 158, in urljoin
    return urlunparse((scheme, netloc, path,
  File "/usr/lib/python2.4/urlparse.py", line 125, in urlunparse
    return urlunsplit((scheme, netloc, url, query, fragment))
  File "/usr/lib/python2.4/urlparse.py", line 130, in urlunsplit
    url = '//' + (netloc or '') + url
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe6 in position 5: ordinal not in range(128)
Investigation revealed that feedparser fails in _urljoin when the URI passed to it contains non-ASCII characters. Thanks to earlier experience with Django's unicode handling, the patch was easy:
def smartstr(s, encoding='utf-8', errors='strict'):
    if isinstance(s, unicode):
        return s.encode(encoding, errors)
    elif s and encoding != 'utf-8':
        return s.decode('utf-8', errors).encode(encoding, errors)
    else:
        return s_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]://)(/)(.*?)')

def urljoin(base, uri):
    """Join ``uri`` onto ``base`` after making ``uri`` safe for urlparse.

    ``uri`` is first converted to a byte string and percent-quoted, which
    turns non-ASCII bytes into ``%XX`` escapes so that
    ``urlparse.urljoin`` no longer hits an implicit ASCII decode. The
    characters listed in ``safe`` are left unquoted.

    NOTE(review): the traceback shows feedparser.py calling this function
    as ``_urljoin``; the leading underscore may have been lost when the
    post was rendered -- confirm the name before applying the patch.
    """
    uri = urllib.quote(smartstr(uri), safe='/#%[]=:;$&()+,!?')
    # Collapse surplus slashes after the scheme before joining.
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)

Tested and works.