feedparser patch for UnicodeDecodeError
September 17, 2007 – 18:18

Recently some feeds grabbed by chytach (and parsed by feedparser.py) started failing with a traceback:
Investigation revealed that feedparser fails in _urljoin when the URI passed to it contains non-ASCII characters. Thanks to earlier Django unicode experience, the patch was easy:

Failed parsing http://del.icio.us/tag/gtd with exceptions.UnicodeDecodeError: {'object': '/tag/\xe6\x97\xb6\xe9\x97\xb4\xe7\xae\xa1\xe7\x90\x86', 'end': 6, 'encoding': 'ascii', 'args': ('ascii', '/tag/\xe6\x97\xb6\xe9\x97\xb4\xe7\xae\xa1\xe7\x90\x86', 5, 6, 'ordinal not in range(128)'), 'start': 5, 'reason': 'ordinal not in range(128)'}
Traceback (most recent call last):
  File "/home/akhavr/chytach/bin/feedupdate.py", line 97, in processfeed
    etag=feed.etag, modified=modified)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 2626, in parse
    feedparser.feed(data)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1441, in feed
    sgmllib.SGMLParser.feed(self, data)
  File "/usr/lib/python2.4/sgmllib.py", line 95, in feed
    self.goahead(0)
  File "/usr/lib/python2.4/sgmllib.py", line 134, in goahead
    k = self.parseendtag(i)
  File "/usr/lib/python2.4/sgmllib.py", line 296, in parseendtag
    self.finishendtag(tag)
  File "/usr/lib/python2.4/sgmllib.py", line 336, in finishendtag
    self.unknownendtag(tag)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 476, in unknownendtag
    method()
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1318, in endcontent
    value = self.popContent('content')
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 700, in popContent
    value = self.pop(tag)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 641, in pop
    output = resolveRelativeURIs(output, self.baseuri, self.encoding)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1594, in _resolveRelativeURIs
    p.feed(htmlSource)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1441, in feed
    sgmllib.SGMLParser.feed(self, data)
  File "/usr/lib/python2.4/sgmllib.py", line 95, in feed
    self.goahead(0)
  File "/usr/lib/python2.4/sgmllib.py", line 129, in goahead
    k = self.parsestarttag(i)
  File "/usr/lib/python2.4/sgmllib.py", line 283, in parsestarttag
    self.finishstarttag(tag, attrs)
  File "/usr/lib/python2.4/sgmllib.py", line 314, in finishstarttag
    self.unknownstarttag(tag, attrs)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1588, in unknownstarttag
    attrs = [(key, ((tag, key) in self.relativeuris) and self.resolveURI(value) or value) for key, value in attrs]
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 1584, in resolveURI
    return urljoin(self.baseuri, uri)
  File "/usr/lib/python2.4/site-packages/feedparser.py", line 286, in _urljoin
    return urlparse.urljoin(base, uri)
  File "/usr/lib/python2.4/urlparse.py", line 158, in urljoin
    return urlunparse((scheme, netloc, path,
  File "/usr/lib/python2.4/urlparse.py", line 125, in urlunparse
    return urlunsplit((scheme, netloc, url, query, fragment))
  File "/usr/lib/python2.4/urlparse.py", line 130, in urlunsplit
    url = '//' + (netloc or '') + url
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe6 in position 5: ordinal not in range(128)
# Tested and works.
#
# NOTE: this is Python 2 code (it relies on ``unicode``, ``urllib.quote`` and
# the ``urlparse`` module); ``re``, ``urllib`` and ``urlparse`` must already be
# imported by the module being patched.


def smartstr(s, encoding='utf-8', errors='strict'):
    """Return ``s`` as a byte string encoded in ``encoding``.

    * ``unicode`` input is encoded with ``encoding``/``errors``.
    * A non-empty byte string is treated as UTF-8 and transcoded when the
      requested ``encoding`` is not UTF-8.
    * Anything else (empty values, or byte strings when the target is
      already UTF-8) is returned unchanged.
    """
    if isinstance(s, unicode):
        return s.encode(encoding, errors)
    elif s and encoding != 'utf-8':
        # Byte strings are assumed to hold UTF-8; re-encode to the target.
        return s.decode('utf-8', errors).encode(encoding, errors)
    else:
        return s


# Matches "<scheme>://" followed by any run of surplus slashes, so that
# "http:////host/path" can be collapsed to "http://host/path".
# The "*" quantifiers are required: without them only two-character schemes
# with exactly one surplus slash would match.  The hyphen is placed last in
# the character class so it is a literal "-" rather than a "+"-to-"." range.
_urifixer = re.compile(r'^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)')


def urljoin(base, uri):
    """Join ``uri`` onto ``base``, tolerating non-ASCII characters in ``uri``.

    ``uri`` is first converted to a UTF-8 byte string and percent-quoted, so
    ``urlparse.urljoin`` only ever sees ASCII and cannot raise
    ``UnicodeDecodeError`` while concatenating the URL parts.  Characters
    listed in ``safe`` (including ``%``) are left untouched, so escapes that
    are already present are not quoted a second time.

    NOTE: the traceback above shows feedparser calling this function as
    ``_urljoin``; use that name when applying the patch to feedparser.py.
    """
    uri = urllib.quote(smartstr(uri), safe='/#%[]=:;$&()+,!?')
    # Drop any extra slashes directly after "<scheme>://".
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)