September 17, 2007 – 18:18
Recently some feeds fetched by chytach (and parsed by feedparser.py) started failing with a traceback:
Failed parsing http://del.icio.us/tag/gtd with exceptions.UnicodeDecodeError: {'object': '/tag/\xe6\x97\xb6\xe9\x97\xb4\xe7\xae\xa1\xe7\x90\x86', 'end': 6, 'encoding': 'ascii', 'args': ('ascii', '/tag/\xe6\x97\xb6\xe9\x97\xb4\xe7\xae\xa1\xe7\x90\x86', 5, 6, 'ordinal not in range(128)'), 'start': 5, 'reason': 'ordinal not in range(128)'}
Traceback (most recent call last):
File "/home/akhavr/chytach/bin/feedupdate.py", line 97, in processfeed
etag=feed.etag, modified=modified)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 2626, in parse
feedparser.feed(data)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 1441, in feed
sgmllib.SGMLParser.feed(self, data)
File "/usr/lib/python2.4/sgmllib.py", line 95, in feed
self.goahead(0)
File "/usr/lib/python2.4/sgmllib.py", line 134, in goahead
k = self.parseendtag(i)
File "/usr/lib/python2.4/sgmllib.py", line 296, in parseendtag
self.finishendtag(tag)
File "/usr/lib/python2.4/sgmllib.py", line 336, in finishendtag
self.unknownendtag(tag)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 476, in unknownendtag
method()
File "/usr/lib/python2.4/site-packages/feedparser.py", line 1318, in endcontent
value = self.popContent('content')
File "/usr/lib/python2.4/site-packages/feedparser.py", line 700, in popContent
value = self.pop(tag)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 641, in pop
output = resolveRelativeURIs(output, self.baseuri, self.encoding)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 1594, in _resolveRelativeURIs
p.feed(htmlSource)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 1441, in feed
sgmllib.SGMLParser.feed(self, data)
File "/usr/lib/python2.4/sgmllib.py", line 95, in feed
self.goahead(0)
File "/usr/lib/python2.4/sgmllib.py", line 129, in goahead
k = self.parsestarttag(i)
File "/usr/lib/python2.4/sgmllib.py", line 283, in parsestarttag
self.finishstarttag(tag, attrs)
File "/usr/lib/python2.4/sgmllib.py", line 314, in finishstarttag
self.unknownstarttag(tag, attrs)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 1588, in unknownstarttag
attrs = [(key, ((tag, key) in self.relativeuris) and self.resolveURI(value) or value) for key, value in attrs]
File "/usr/lib/python2.4/site-packages/feedparser.py", line 1584, in resolveURI
return urljoin(self.baseuri, uri)
File "/usr/lib/python2.4/site-packages/feedparser.py", line 286, in _urljoin
return urlparse.urljoin(base, uri)
File "/usr/lib/python2.4/urlparse.py", line 158, in urljoin
return urlunparse((scheme, netloc, path,
File "/usr/lib/python2.4/urlparse.py", line 125, in urlunparse
return urlunsplit((scheme, netloc, url, query, fragment))
File "/usr/lib/python2.4/urlparse.py", line 130, in urlunsplit
url = '//' + (netloc or '') + url
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe6 in position 5: ordinal not in range(128)
Investigation revealed that
feedparser fails in _urljoin when the URI passed to it contains non-ASCII characters. Thanks to
earlier experience with Unicode handling in Django, the patch was easy:
def smartstr(s, encoding='utf-8', errors='strict'):
    """Return ``s`` as a byte string in ``encoding``.

    * ``unicode`` input is encoded to ``encoding``.
    * A non-empty byte string is treated as UTF-8 and transcoded, but only
      when the requested ``encoding`` is something other than ``'utf-8'``.
    * Everything else (empty/false values, or byte strings when the target
      is UTF-8) is returned unchanged.

    ``errors`` is forwarded to both the ``decode`` and ``encode`` calls.
    """
    if isinstance(s, unicode):
        return s.encode(encoding, errors)
    elif s and encoding != 'utf-8':
        # Byte strings are assumed to be UTF-8 already; transcode them.
        return s.decode('utf-8', errors).encode(encoding, errors)
    else:
        return s


# Matches "<scheme>://" followed by any run of surplus slashes, so that
# substituting r'\1\3' turns "http:////host/path" into "http://host/path".
# The "*" quantifiers are essential: without them the pattern only matches a
# two-character scheme followed by exactly one extra slash.
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def urljoin(base, uri):
    """Resolve ``uri`` against ``base``, tolerating non-ASCII characters.

    ``uri`` is first converted to a UTF-8 byte string and percent-encoded,
    then any surplus slashes after the scheme are removed, and finally the
    result is joined onto ``base`` with ``urlparse.urljoin``.

    NOTE(review): the traceback reports the feedparser function as
    ``_urljoin``; the leading underscore may have been lost from this
    listing -- confirm the name before applying the patch.
    """
    # Percent-encode before joining: urlparse concatenates the pieces, and
    # mixing a unicode base with a non-ASCII byte string triggers an implicit
    # ASCII decode, which is the UnicodeDecodeError seen in the traceback.
    # The characters listed in ``safe`` are URI syntax and are left unquoted.
    uri = urllib.quote(smartstr(uri), safe='/#%[]=:;$&()+,!?')
    # Drop extra slashes that directly follow "<scheme>://".
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
Tested and works.
Posted in Python | No Comments »