[Commits] (pavlov) updating to feedparser 3.0 beta 22
commits at osafoundation.org
commits at osafoundation.org
Tue Apr 20 14:12:36 PDT 2004
Commit by: pavlov
Modified files:
osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/RSSData.py 1.13 1.14
osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/ZaoBaoTask.py 1.1 1.2
osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/feedparser.py 1.3 1.4
Log message:
updating to feedparser 3.0 beta 22
ViewCVS links:
http://cvs.osafoundation.org/index.cgi/osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/RSSData.py.diff?r1=text&tr1=1.13&r2=text&tr2=1.14
http://cvs.osafoundation.org/index.cgi/osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/ZaoBaoTask.py.diff?r1=text&tr1=1.1&r2=text&tr2=1.2
http://cvs.osafoundation.org/index.cgi/osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/feedparser.py.diff?r1=text&tr1=1.3&r2=text&tr2=1.4
Index: osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/ZaoBaoTask.py
diff -u osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/ZaoBaoTask.py:1.1 osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/ZaoBaoTask.py:1.2
--- osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/ZaoBaoTask.py:1.1 Mon Apr 19 14:07:56 2004
+++ osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/ZaoBaoTask.py Tue Apr 20 14:12:05 2004
@@ -1,5 +1,5 @@
-__revision__ = "$Revision: 1.1 $"
-__date__ = "$Date: 2004/04/19 21:07:56 $"
+__revision__ = "$Revision: 1.2 $"
+__date__ = "$Date: 2004/04/20 21:12:05 $"
__copyright__ = "Copyright (c) 2004 Open Source Applications Foundation"
__license__ = "http://osafoundation.org/Chandler_0.1_license_terms.htm"
@@ -38,6 +38,11 @@
#print 'failed to parse %s' % item.url
#print e
logging.exception('zaobao failed to parse %s' % item.url)
+ except UnicodeEncodeError, e:
+ #print 'failed to parse %s' % item.url
+ #print e
+ logging.exception('zaobao failed to parse %s' % item.url)
+
repository.commit()
#print 'Updated feeds'
Index: osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/RSSData.py
diff -u osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/RSSData.py:1.13 osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/RSSData.py:1.14
--- osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/RSSData.py:1.13 Mon Apr 19 14:07:56 2004
+++ osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/RSSData.py Tue Apr 20 14:12:05 2004
@@ -1,5 +1,5 @@
-__revision__ = "$Revision: 1.13 $"
-__date__ = "$Date: 2004/04/19 21:07:56 $"
+__revision__ = "$Revision: 1.14 $"
+__date__ = "$Date: 2004/04/20 21:12:05 $"
__copyright__ = "Copyright (c) 2003 Open Source Applications Foundation"
__license__ = "http://osafoundation.org/Chandler_0.1_license_terms.htm"
@@ -132,11 +132,7 @@
lastModified = lastModified.tuple()
# fetch the data
-
- # XXX because of a bug in feedparser 3.0 betas we don't do
- # etags or lastmodified here (bug 1421)
- #data = feedparser.parse(self.url, etag, lastModified)
- data = feedparser.parse(self.url)
+ data = feedparser.parse(self.url, etag, lastModified)
# set etag
SetAttribute(self, data, 'etag')
@@ -147,8 +143,11 @@
self.lastModified = mx.DateTime.mktime(modified)
# if the feed is bad, raise the sax exception
- if data['bozo'] == 1:
- raise data['bozo_exception']
+ try:
+ if data['bozo'] == 1:
+ raise data['bozo_exception']
+ except KeyError:
+ return
self._DoChannel(data['channel'])
self._DoItems(data['items'])
Index: osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/feedparser.py
diff -u osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/feedparser.py:1.3 osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/feedparser.py:1.4
--- osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/feedparser.py:1.3 Mon Apr 19 14:07:56 2004
+++ osaf/chandler/Chandler/parcels/OSAF/examples/zaobao/feedparser.py Tue Apr 20 14:12:05 2004
@@ -3,14 +3,14 @@
Visit http://diveintomark.org/projects/feed_parser/ for the latest version
-Handles RSS 0.9x, RSS 1.0, RSS 2.0, Hot RSS, Atom, CDF feeds
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: libxml2 <http://xmlsoft.org/python.html>
"""
-__version__ = "3.0-beta-21"
+__version__ = "3.0-beta-22"
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__copyright__ = "Copyright 2002-4, Mark Pilgrim"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
@@ -18,15 +18,20 @@
"Fazal Majid <http://www.majid.info/mylos/weblog/>"]
__license__ = "Python"
_debug = 0
+_debug_never_use_libxml2 = 0
# if you are embedding feedparser in a larger application, you should change this to your application name and URL
USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
-# ---------- required modules (should come with any Python distribution) ----------
-#import cjkcodecs.aliases
-#import japanese
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set this to 1.
+# This is off by default because of reports of crashing on some platforms. If it crashes
+# for you, please submit a bug report with your OS platform, Python version, and the URL
+# of the feed you were attempting to parse.
+# Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+TIDY_MARKUP = 0
-import sgmllib, re, sys, copy, urlparse, time, rfc822
+# ---------- required modules (should come with any Python distribution) ----------
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types
try:
from cStringIO import StringIO as _StringIO
except:
@@ -53,12 +58,12 @@
socket.setdefaulttimeout(10)
import urllib2
-# mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc.
-# this does not affect HTML sanitizing, which is self-contained in the _HTMLSanitizer class
-#try:
-# from mx.Tidy import Tidy as _mxtidy # http://www.lemburg.com/files/python/mxTidy.html
-#except:
-# _mxtidy = None
+_mxtidy = None
+if TIDY_MARKUP:
+ try:
+ from mx.Tidy import Tidy as _mxtidy
+ except:
+ pass
# If a real XML parser is available, feedparser will attempt to use it. feedparser works
# with both the built-in SAX parser and PyXML SAX parser. On platforms where the Python
@@ -72,6 +77,7 @@
try:
import xml.sax
from xml.sax.saxutils import escape as _xmlescape
+ class CharacterEncodingOverride(xml.sax.SAXException): pass
_XML_AVAILABLE = 1
except:
_XML_AVAILABLE = 0
@@ -106,7 +112,7 @@
'atom03': 'Atom 0.3',
'atom': 'Atom (unknown version)',
'cdf': 'CDF',
- 'hotrss': 'Hot RSS',
+ 'hotrss': 'Hot RSS'
}
try:
@@ -119,6 +125,23 @@
rc[k] = v
return rc
+from UserDict import UserDict
+class FeedParserDict(UserDict):
+ def __getitem__(self, key):
+ if key == 'channel': key = 'feed'
+ if key == 'items': key = 'entries'
+ return UserDict.__getitem__(self, key)
+
+ def __getattr__(self, key):
+ try:
+ return self.__dict__[key]
+ except KeyError:
+ pass
+ try:
+ return self.__getitem__(key)
+ except:
+ raise AttributeError, "object has no attribute '%s'" % key
+
class _FeedParserMixin:
namespaces = {"": "",
"http://backend.userland.com/rss": "",
@@ -180,23 +203,24 @@
can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
html_types = ['text/html', 'application/xhtml+xml']
- def __init__(self, baseuri=None):
+ def __init__(self, baseuri=None, encoding='utf-8'):
if _debug: sys.stderr.write("initializing FeedParser\n")
- self.channel = {} # channel- or feed-level data
- self.items = [] # list of item- or entry-level data
+ self.feeddata = FeedParserDict() # feed-level data
+ self.encoding = encoding # character encoding
+ self.entries = [] # list of entry-level data
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
# the following are used internally to track state;
# some of this is kind of out of control and should
# probably be refactored into a finite state machine
- self.inchannel = 0
- self.initem = 0
+ self.infeed = 0
+ self.inentry = 0
self.incontent = 0
self.intextinput = 0
self.inimage = 0
self.inauthor = 0
self.incontributor = 0
- self.contentparams = {}
+ self.contentparams = FeedParserDict()
self.namespacemap = {}
self.elementstack = []
self.basestack = []
@@ -334,35 +358,10 @@
def handle_decl(self, text):
pass
-# # called for the DOCTYPE, if present, e.g.
-# # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
-# # "http://www.w3.org/TR/html4/loose.dtd">
-# if text.count('http://my.netscape.com/publish/formats/rss-0.91.dtd'):
-# self.version = 'rss091n'
-
- _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
- def _scan_name(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- if i == n:
- return None, -1
- m = self._new_declname_match(rawdata, i)
- if m:
- s = m.group()
- name = s.strip()
- if (i + len(s)) == n:
- return None, -1 # end of buffer
- return name.lower(), m.end()
- else:
- self.updatepos(declstartpos, i)
- self.error("expected name token")
def parse_declaration(self, i):
# override internal declaration handler to handle CDATA blocks
if _debug: sys.stderr.write("entering parse_declaration\n")
-# if re.search(r'^<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', self.rawdata[i:]):
-# if _debug: sys.stderr.write("found Netscape DOCTYPE\n")
-# self.version = 'rss091n'
if self.rawdata[i:i+9] == '<![CDATA[':
k = self.rawdata.find(']]>', i)
if k == -1: k = len(self.rawdata)
@@ -429,50 +428,56 @@
# resolve relative URIs within embedded markup
if element in self.can_contain_relative_uris:
- output = _resolveRelativeURIs(output, self.baseuri)
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
# sanitize embedded markup
if element in self.can_contain_dangerous_markup:
- output = _sanitizeHTML(output)
+ output = _sanitizeHTML(output, self.encoding)
+
+ if type(output) == types.StringType:
+ try:
+ output = unicode(output, self.encoding)
+ except:
+ pass
# store output in appropriate place(s)
- if self.initem:
+ if self.inentry:
if element == 'content':
- self.items[-1].setdefault(element, [])
+ self.entries[-1].setdefault(element, [])
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
- self.items[-1][element].append(contentparams)
+ self.entries[-1][element].append(contentparams)
elif element == 'category':
- self.items[-1][element] = output
- domain = self.items[-1]['categories'][-1][0]
- self.items[-1]['categories'][-1] = (domain, output)
+ self.entries[-1][element] = output
+ domain = self.entries[-1]['categories'][-1][0]
+ self.entries[-1]['categories'][-1] = (domain, output)
elif element == 'source':
- self.items[-1]['source']['value'] = output
+ self.entries[-1]['source']['value'] = output
elif element == 'link':
- self.items[-1][element] = output
+ self.entries[-1][element] = output
if output:
- self.items[-1]['links'][-1]['href'] = output
+ self.entries[-1]['links'][-1]['href'] = output
else:
- self.items[-1][element] = output
+ self.entries[-1][element] = output
if self.incontent:
if element == 'description':
element = 'summary'
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
- self.items[-1][element + '_detail'] = contentparams
- elif self.inchannel and (not self.intextinput) and (not self.inimage):
- self.channel[element] = output
+ self.entries[-1][element + '_detail'] = contentparams
+ elif self.infeed and (not self.intextinput) and (not self.inimage):
+ self.feeddata[element] = output
if element == 'category':
- domain = self.channel['categories'][-1][0]
- self.channel['categories'][-1] = (domain, output)
+ domain = self.feeddata['categories'][-1][0]
+ self.feeddata['categories'][-1] = (domain, output)
elif element == 'link':
- self.channel['links'][-1]['href'] = output
+ self.feeddata['links'][-1]['href'] = output
elif self.incontent:
if element == 'description':
element = 'tagline'
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
- self.channel[element + '_detail'] = contentparams
+ self.feeddata[element + '_detail'] = contentparams
return output
def _mapToStandardPrefix(self, name):
@@ -489,10 +494,10 @@
def _save(self, key, value):
if value:
- if self.initem:
- self.items[-1].setdefault(key, value)
- elif self.channel:
- self.channel.setdefault(key, value)
+ if self.inentry:
+ self.entries[-1].setdefault(key, value)
+ elif self.feeddata:
+ self.feeddata.setdefault(key, value)
def _start_rss(self, attrsD):
versionmap = {'0.91': 'rss091u',
@@ -513,7 +518,7 @@
self.version = 'hotrss'
def _start_channel(self, attrsD):
- self.inchannel = 1
+ self.infeed = 1
self._cdf_common(attrsD)
_start_feedinfo = _start_channel
@@ -529,7 +534,7 @@
self._end_link()
def _start_feed(self, attrsD):
- self.inchannel = 1
+ self.infeed = 1
versionmap = {'0.1': 'atom01',
'0.2': 'atom02',
'0.3': 'atom03'}
@@ -542,7 +547,7 @@
self.version = 'atom'
def _end_channel(self):
- self.inchannel = 0
+ self.infeed = 0
_end_feed = _end_channel
def _start_image(self, attrsD):
@@ -555,7 +560,7 @@
self.intextinput = 1
self.push('textinput', 0)
context = self._getContext()
- context.setdefault('textinput', {})
+ context.setdefault('textinput', FeedParserDict())
_start_textInput = _start_textinput
def _end_textinput(self):
@@ -582,7 +587,7 @@
self.incontributor = 1
context = self._getContext()
context.setdefault('contributors', [])
- context['contributors'].append({})
+ context['contributors'].append(FeedParserDict())
self.push('contributor', 0)
def _end_contributor(self):
@@ -634,21 +639,21 @@
pass
def _getContext(self):
- if self.initem:
- context = self.items[-1]
+ if self.inentry:
+ context = self.entries[-1]
else:
- context = self.channel
+ context = self.feeddata
return context
def _save_author(self, key, value):
context = self._getContext()
- context.setdefault('author_detail', {})
+ context.setdefault('author_detail', FeedParserDict())
context['author_detail'][key] = value
self._sync_author_detail()
def _save_contributor(self, key, value):
context = self._getContext()
- context.setdefault('contributors', [{}])
+ context.setdefault('contributors', [FeedParserDict()])
context['contributors'][-1][key] = value
def _sync_author_detail(self):
@@ -678,16 +683,16 @@
if author and (author[-1] == ')'):
author = author[:-1]
author = author.strip()
- context.setdefault('author_detail', {})
+ context.setdefault('author_detail', FeedParserDict())
context['author_detail']['name'] = author
context['author_detail']['email'] = email
def _start_tagline(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('tagline', 1)
_start_subtitle = _start_tagline
@@ -695,16 +700,16 @@
value = self.pop('tagline')
self.incontent -= 1
self.contentparams.clear()
- if self.inchannel:
- self.channel['description'] = value
+ if self.infeed:
+ self.feeddata['description'] = value
_end_subtitle = _end_tagline
def _start_copyright(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('copyright', 1)
_start_dc_rights = _start_copyright
@@ -715,16 +720,16 @@
_end_dc_rights = _end_copyright
def _start_item(self, attrsD):
- self.items.append({})
+ self.entries.append(FeedParserDict())
self.push('item', 0)
- self.initem = 1
+ self.inentry = 1
self._cdf_common(attrsD)
_start_entry = _start_item
_start_product = _start_item
def _end_item(self):
self.pop('item')
- self.initem = 0
+ self.inentry = 0
_end_entry = _end_item
def _start_dc_language(self, attrsD):
@@ -801,10 +806,10 @@
self.push('category', 1)
domain = self._getAttribute(attrsD, 'domain')
cats = []
- if self.initem:
- cats = self.items[-1].setdefault('categories', [])
- elif self.inchannel:
- cats = self.channel.setdefault('categories', [])
+ if self.inentry:
+ cats = self.entries[-1].setdefault('categories', [])
+ elif self.infeed:
+ cats = self.feeddata.setdefault('categories', [])
cats.append((domain, None))
_start_dc_subject = _start_category
_start_keywords = _start_category
@@ -815,27 +820,27 @@
_end_keywords = _end_category
def _start_cloud(self, attrsD):
- self.channel['cloud'] = attrsD
+ self.feeddata['cloud'] = attrsD
def _start_link(self, attrsD):
attrsD.setdefault('rel', 'alternate')
attrsD.setdefault('type', 'text/html')
if attrsD.has_key('href'):
attrsD['href'] = self.resolveURI(attrsD['href'])
- expectingText = self.inchannel or self.initem
- if self.initem:
- self.items[-1].setdefault('links', [])
- self.items[-1]['links'].append(attrsD)
- elif self.inchannel:
- self.channel.setdefault('links', [])
- self.channel['links'].append(attrsD)
+ expectingText = self.infeed or self.inentry
+ if self.inentry:
+ self.entries[-1].setdefault('links', [])
+ self.entries[-1]['links'].append(attrsD)
+ elif self.infeed:
+ self.feeddata.setdefault('links', [])
+ self.feeddata['links'].append(attrsD)
if attrsD.has_key('href'):
expectingText = 0
if attrsD.get('type', '') in self.html_types:
- if self.initem:
- self.items[-1]['link'] = attrsD['href']
- elif self.inchannel:
- self.channel['link'] = attrsD['href']
+ if self.inentry:
+ self.entries[-1]['link'] = attrsD['href']
+ elif self.infeed:
+ self.feeddata['link'] = attrsD['href']
else:
self.push('link', expectingText)
_start_producturl = _start_link
@@ -868,11 +873,11 @@
def _start_title(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
- self.push('title', self.inchannel or self.initem)
+ 'base': attrsD.get('xml:base', self.baseuri)})
+ self.push('title', self.infeed or self.inentry)
_start_dc_title = _start_title
def _end_title(self):
@@ -886,11 +891,11 @@
def _start_description(self, attrsD, default_content_type='text/html'):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', default_content_type),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
- self.push('description', self.inchannel or self.initem)
+ 'base': attrsD.get('xml:base', self.baseuri)})
+ self.push('description', self.infeed or self.inentry)
def _start_abstract(self, attrsD):
return self._start_description(attrsD, 'text/plain')
@@ -902,18 +907,18 @@
context = self._getContext()
if self.intextinput:
context['textinput']['description'] = value
- elif self.initem:
+ elif self.inentry:
context['summary'] = value
- elif self.inchannel:
+ elif self.infeed:
context['tagline'] = value
_end_abstract = _end_description
def _start_info(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('info', 1)
def _end_info(self):
@@ -925,13 +930,13 @@
if attrsD:
if attrsD.has_key('url'):
attrsD['url'] = self.resolveURI(attrsD['url'])
- self.channel['generator_detail'] = attrsD
+ self.feeddata['generator_detail'] = attrsD
self.push('generator', 1)
def _end_generator(self):
value = self.pop('generator')
- if self.channel.has_key('generator_detail'):
- self.channel['generator_detail']['name'] = value
+ if self.feeddata.has_key('generator_detail'):
+ self.feeddata['generator_detail']['name'] = value
def _start_admin_generatoragent(self, attrsD):
self.push('generator', 1)
@@ -949,27 +954,27 @@
def _start_summary(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('summary', 1)
def _end_summary(self):
value = self.pop('summary')
- if self.items:
- self.items[-1]['description'] = value
+ if self.entries:
+ self.entries[-1]['description'] = value
self.incontent -= 1
self.contentparams.clear()
def _start_enclosure(self, attrsD):
- if self.initem:
- self.items[-1].setdefault('enclosures', [])
- self.items[-1]['enclosures'].append(attrsD)
+ if self.inentry:
+ self.entries[-1].setdefault('enclosures', [])
+ self.entries[-1]['enclosures'].append(attrsD)
def _start_source(self, attrsD):
- if self.initem:
- self.items[-1]['source'] = attrsD
+ if self.inentry:
+ self.entries[-1]['source'] = attrsD
self.push('source', 1)
def _end_source(self):
@@ -977,35 +982,35 @@
def _start_content(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'xml'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('content', 1)
def _start_prodlink(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'xml'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
'type': attrsD.get('type', 'text/html'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('content', 1)
def _start_body(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': 'xml',
+ self.contentparams = FeedParserDict({'mode': 'xml',
'type': 'application/xhtml+xml',
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('content', 1)
_start_xhtml_body = _start_body
def _start_content_encoded(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': 'escaped',
+ self.contentparams = FeedParserDict({'mode': 'escaped',
'type': 'text/html',
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('content', 1)
_start_fullitem = _start_content_encoded
@@ -1023,10 +1028,10 @@
if _XML_AVAILABLE:
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler, xml.sax.handler.EntityResolver):#, xml.sax.handler.DTDHandler):
- def __init__(self, baseuri):
+ def __init__(self, baseuri, encoding):
if _debug: sys.stderr.write('trying StrictFeedParser\n')
xml.sax.handler.ContentHandler.__init__(self)
- _FeedParserMixin.__init__(self, baseuri)
+ _FeedParserMixin.__init__(self, baseuri, encoding)
self.bozo = 0
self.exc = None
@@ -1062,8 +1067,7 @@
self.unknown_starttag(localname, attrsD.items())
def resolveEntity(self, publicId, systemId):
- pass
-# return _StringIO()
+ return _StringIO()
def characters(self, text):
self.handle_data(text)
@@ -1080,26 +1084,32 @@
def error(self, exc):
self.bozo = 1
self.exc = exc
+
def fatalError(self, exc):
self.error(exc)
raise exc
-class _LooseFeedParser(_FeedParserMixin, sgmllib.SGMLParser):
- def __init__(self, baseuri):
- sgmllib.SGMLParser.__init__(self)
- _FeedParserMixin.__init__(self, baseuri)
-
class _BaseHTMLProcessor(sgmllib.SGMLParser):
elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
'img', 'input', 'isindex', 'link', 'meta', 'param']
- def __init__(self):
+ def __init__(self, encoding):
+ self.encoding = encoding
sgmllib.SGMLParser.__init__(self)
def reset(self):
self.pieces = []
sgmllib.SGMLParser.reset(self)
+ def feed(self, data):
+ data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
+ data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
+ data = data.replace(''', "'")
+ data = data.replace('"', '"')
+ if type(data) == types.UnicodeType:
+ data = data.encode(self.encoding)
+ sgmllib.SGMLParser.feed(self, data)
+
def normalize_attrs(self, attrs):
# utility method to be called by descendants
attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
@@ -1110,6 +1120,7 @@
# called for each start tag
# attrs is a list of (attr, value) tuples
# e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
if tag in self.elements_no_end_tag:
self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
@@ -1136,6 +1147,7 @@
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
# Store the original text verbatim.
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
self.pieces.append(text)
def handle_comment(self, text):
@@ -1155,9 +1167,37 @@
# Reconstruct original DOCTYPE
self.pieces.append("<!%(text)s>" % locals())
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+ def _scan_name(self, i, declstartpos):
+ rawdata = self.rawdata
+ if _debug: sys.stderr.write("i=%s, declstartpos=%s, rawdata=%s\n" % (i, declstartpos, rawdata))
+ n = len(rawdata)
+ if i == n:
+ return None, -1
+ m = self._new_declname_match(rawdata, i)
+ if m:
+ s = m.group()
+ name = s.strip()
+ if (i + len(s)) == n:
+ return None, -1 # end of buffer
+ return name.lower(), m.end()
+ else:
+ self.handle_data(rawdata)
+# self.updatepos(declstartpos, i)
+ return None, -1
+
def output(self):
"""Return processed HTML as a single string"""
- return "".join(self.pieces)
+ if _debug:
+ for p in self.pieces:
+ sys.stderr.write(p)
+ sys.stderr.write('\n')
+ return "".join([str(p) for p in self.pieces])
+
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
+ def __init__(self, baseuri, encoding):
+ sgmllib.SGMLParser.__init__(self)
+ _FeedParserMixin.__init__(self, baseuri, encoding)
class _RelativeURIResolver(_BaseHTMLProcessor):
relative_uris = [('a', 'href'),
@@ -1186,8 +1226,8 @@
('q', 'cite'),
('script', 'src')]
- def __init__(self, baseuri):
- _BaseHTMLProcessor.__init__(self)
+ def __init__(self, baseuri, encoding):
+ _BaseHTMLProcessor.__init__(self, encoding)
self.baseuri = baseuri
def resolveURI(self, uri):
@@ -1198,8 +1238,10 @@
attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-def _resolveRelativeURIs(htmlSource, baseURI):
- p = _RelativeURIResolver(baseURI)
+def _resolveRelativeURIs(htmlSource, baseURI, encoding):
+ if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
+ p = _RelativeURIResolver(baseURI, encoding)
+ if _debug: sys.stderr.write(repr(type(htmlSource)) + '\n')
p.feed(htmlSource)
return p.output()
@@ -1256,18 +1298,18 @@
if not self.unacceptablestack:
_BaseHTMLProcessor.handle_data(self, text)
-def _sanitizeHTML(htmlSource):
- p = _HTMLSanitizer()
+def _sanitizeHTML(htmlSource, encoding):
+ p = _HTMLSanitizer(encoding)
p.feed(htmlSource)
data = p.output()
-# if _mxtidy:
-# nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
-# if data.count('<body'):
-# data = data.split('<body', 1)[1]
-# if data.count('>'):
-# data = data.split('>', 1)[1]
-# if data.count('</body'):
-# data = data.split('</body', 1)[0]
+ if _mxtidy and TIDY_MARKUP:
+ nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
+ if data.count('<body'):
+ data = data.split('<body', 1)[1]
+ if data.count('>'):
+ data = data.split('>', 1)[1]
+ if data.count('</body'):
+ data = data.split('</body', 1)[0]
data = data.strip().replace('\r\n', '\n')
return data
@@ -1636,10 +1678,15 @@
"""
if not content_type:
return '', ''
+ content_type = content_type.strip()
paramstr = content_type.split(';')[1:]
if not paramstr:
return content_type, ''
content_type = content_type.split(';', 1)[0].strip().lower()
+ if not paramstr[0]:
+ # declaration like "text/xml;" (note ending semicolon)
+ # dunno if this is malformed but it sure was hard to track down
+ return content_type, ''
import string
params = dict([map(string.lower, map(string.strip, p.strip().split('=', 1))) for p in paramstr])
charset = params.get('charset')
@@ -1680,11 +1727,13 @@
def _changeEncodingDeclaration(data, encoding):
"""Changes an XML data stream on the fly to specify a new encoding
- data is a raw string (not Unicode) that is presumed to be in %encoding already
+ data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
encoding is a string recognized by encodings.aliases
"""
if _debug: sys.stderr.write('entering _changeEncodingDeclaration\n')
if _debug: sys.stderr.write('proposed encoding: %s\n' % encoding)
+ #import cjkcodecs.aliases
+ #import japanese
data = unicode(data, encoding)
declmatch = re.compile(u'^<\?xml[^>]*?>')
newdecl = unicode("""<?xml version='1.0' encoding='%s'?>""" % encoding, encoding)
@@ -1712,7 +1761,7 @@
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
"""Parse a feed from a URL, file, stream, or string"""
- result = {}
+ result = FeedParserDict()
f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer)
data = f.read()
if hasattr(f, "headers"):
@@ -1736,17 +1785,28 @@
if hasattr(f, "headers"):
result["headers"] = f.headers.dict
f.close()
- result['channel'] = {}
- result['items'] = {}
+ if result.get("status", 0) == 304:
+ result['feed'] = FeedParserDict()
+ result['entries'] = []
+ result['debug_message'] = "The feed has not changed since you last checked, so the server sent no data. This is a feature, not a bug!"
+ return result
result['encoding'], http_encoding, xml_encoding = _getCharacterEncoding(result.get("headers", {}), data)
result['version'], data = _stripDoctype(data)
baseuri = result.get('headers', {}).get('content-location', result.get('url'))
# try true XML parser first
- if _XML_AVAILABLE:
+ if not _XML_AVAILABLE:
+ if _debug: sys.stderr.write('no xml libraries available\n')
+ use_strict_parser = _XML_AVAILABLE
+ if use_strict_parser:
if _debug: sys.stderr.write('using xml library\n')
result['bozo'] = 0
- feedparser = _StrictFeedParser(baseuri)
- saxparser = xml.sax.make_parser(["drv_libxml2"])
+ feedparser = _StrictFeedParser(baseuri, result['encoding'])
+ if _debug and _debug_never_use_libxml2:
+ sys.stderr.write('not using libxml2 (even if available)\n')
+ additional_parsers = []
+ else:
+ additional_parsers = ["drv_libxml2"]
+ saxparser = xml.sax.make_parser(additional_parsers)
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
saxparser.setContentHandler(feedparser)
saxparser.setErrorHandler(feedparser)
@@ -1761,63 +1821,55 @@
# libxml2 driver does not support EntityResolver
if _debug: sys.stderr.write('using an xml library that does not support EntityResolver (not a big deal)\n')
encoding_set = (result['encoding'] == xml_encoding)
-# if not encoding_set:
-# if hasattr(xml.sax.handler, "property_encoding"):
-# try:
-# saxparser.setProperty(xml.sax.handler.property_encoding, result['encoding'])
-# encoding_set = 1
-# except xml.sax.SAXNotSupportedException:
-# pass
-# except xml.sax.SAXNotRecognizedException:
-# pass
- bozo_exception = None
if not encoding_set:
- try:
- data = _changeEncodingDeclaration(data, result['encoding'])
- encoding_set = 1
- except Exception, bozo_exception:
- pass
- if encoding_set:
- source = xml.sax.xmlreader.InputSource()
- source.setByteStream(_StringIO(data))
- if hasattr(saxparser, '_ns_stack'):
- # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
- # PyXML doesn't have this problem, and it doesn't have _ns_stack either
- saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
- try:
- saxparser.parse(source)
- except Exception, e:
- # SAX parser is supposed to catch all of these and call feedparser.fatal_error,
- # which captures them. For some reason, some Unicode-related errors go
- # uncaught on some combination of platform, XML library, Python version,
- # and phase of the moon.
- feedparser.bozo = 1
- feedparser.bozo_exception = e
- if feedparser.bozo:
- # feed is not well-formed XML, fall back on regex-based parser
- if _debug: sys.stderr.write('xml parsing failed, using regexes. now you have two problems...\n')
- result['bozo'] = 1
- result['bozo_exception'] = feedparser.exc
- # munge short tags, e.g. <description/> becomes <description></description>
- data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
- feedparser = _LooseFeedParser(baseuri)
- feedparser.feed(data)
- else:
- if _debug: sys.stderr.write('character encoding is wrong, using regexes\n')
+ bozo_exception = None
+ proposed_encodings = [result['encoding'], xml_encoding, 'utf-8', 'iso-8859-1', 'windows-1252']
+ tried_encodings = []
+ for proposed_encoding in proposed_encodings:
+ if proposed_encodings in tried_encodings: continue
+ tried_encodings.append(proposed_encoding)
+ try:
+ data = _changeEncodingDeclaration(data, proposed_encoding)
+ except Exception, bozo_exception:
+ if _debug: sys.stderr.write('character encoding is wrong\n')
+ else:
+ if proposed_encoding != result['encoding']:
+ try:
+ raise CharacterEncodingOverride, "document declared as %s, but parsed as %s" % (result['encoding'], proposed_encoding)
+ except CharacterEncodingOverride, bozo_exception:
+ result['bozo'] = 1
+ result['bozo_exception'] = bozo_exception
+ result['encoding'] = proposed_encoding
+ encoding_set = 1
+ break
+ if not encoding_set:
result['bozo'] = 1
result['bozo_exception'] = bozo_exception
- # munge short tags, e.g. <description/> becomes <description></description>
- data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
- feedparser = _LooseFeedParser(baseuri)
- feedparser.feed(data)
- else:
- if _debug: sys.stderr.write('no xml libraries available, using regexes\n')
- # munge short tags, e.g. <description/> becomes <description></description>
- data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
- feedparser = _LooseFeedParser(baseuri)
+ use_strict_parser = 0
+ if use_strict_parser:
+ source = xml.sax.xmlreader.InputSource()
+ source.setByteStream(_StringIO(data))
+ if hasattr(saxparser, '_ns_stack'):
+ # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
+ # PyXML doesn't have this problem, and it doesn't have _ns_stack either
+ saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
+ try:
+ saxparser.parse(source)
+ except Exception, e:
+ if _debug: sys.stderr.write('xml parsing failed\n')
+ feedparser.bozo = 1
+ feedparser.bozo_exception = feedparser.exc or e
+ if feedparser.bozo:
+ # feed is not well-formed XML, fall back on regex-based parser
+ result['bozo'] = 1
+ result['bozo_exception'] = feedparser.bozo_exception
+ use_strict_parser = 0
+ if not use_strict_parser:
+ if _debug: sys.stderr.write('using regexes, now you have two problems\n')
+ feedparser = _LooseFeedParser(baseuri, result['encoding'])
feedparser.feed(data)
- result['channel'] = feedparser.channel
- result['items'] = feedparser.items
+ result['feed'] = feedparser.feeddata
+ result['entries'] = feedparser.entries
result['version'] = result['version'] or feedparser.version
return result
@@ -1916,31 +1968,62 @@
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
# blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
-#3.0 - MAP - parse entire feed with real XML parser (if available); added several
-# new supported namespaces; fixed bug tracking naked markup in description;
-# added support for enclosure; added support for source; re-added support for
-# cloud which got dropped somehow; added support for expirationDate; fixed
-# xml:lang inheritance; fixed multiple bugs tracking xml:base URI, one for
-# documents that don't define one explicitly and one for documents that define
-# an outer and an inner xml:base that goes out of scope before the end of the
-# document; fixed bug parsing multiple links at feed level; added feed type and
-# version detection, result["version"] will be one of SUPPORTED_VERSIONS.keys()
-# or empty string if unrecognized; added support for creativeCommons:license and
-# cc:license; added support for full Atom content model in title, tagline, info,
-# copyright, summary; fixed bug with gzip encoding (not always telling server
-# we support it when we do); support Atom-style author element in author_detail
+#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
+# added several new supported namespaces; fixed bug tracking naked markup in
+# description; added support for enclosure; added support for source; re-added
+# support for cloud which got dropped somehow; added support for expirationDate
+#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
+# xml:base URI, one for documents that don't define one explicitly and one for
+# documents that define an outer and an inner xml:base that goes out of scope
+# before the end of the document
+#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
+#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result["version"]
+# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
+# added support for creativeCommons:license and cc:license; added support for
+# full Atom content model in title, tagline, info, copyright, summary; fixed bug
+# with gzip encoding (not always telling server we support it when we do)
+#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
# (dictionary of "name", "url", "email"); map author to author_detail if author
-# contains name + email address; better handling of empty HTML tags (br, hr, img,
-# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />);
-# fixed CDATA handling in non-wellformed feeds under Python 2.1; fixed bug
-# resolving relative links in wfw:commentRSS; fixed bug capturing author and
-# contributor URL; fixed bug resolving relative links in author and contributor
-# URL; fixed bug resolvin relative links in generator URL; added support for
-# recognizing RSS 1.0; passed Simon Fell's namespace tests, and included them
-# permanently in the test suite with his permission; determine character
-# encoding as per RFC 3023; always map description to summary_detail (Andrei);
-# use libxml2 (if available); fixed bug exploding author information when
-# author name was in parentheses; removed ultra-problematic mxTidy support;
-# patch to workaround crash in PyXML/expat when encountering invalid entities
-# (MarkMoraes); support for textinput/textInput; added CDF support; added
-# Hot RSS support
+# contains name + email address
+#3.0b8 - 1/28/2004 - MAP - added support for contributor
+#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
+# support for summary
+#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
+# xml.util.iso8601
+#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
+# dangerous markup; fiddled with decodeEntities (not right); liberalized
+# date parsing even further
+#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
+# added support to Atom 0.2 subtitle; added support for Atom content model
+# in copyright; better sanitizing of dangerous HTML elements with end tags
+# (script, frameset)
+#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
+# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
+#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
+# Python 2.1
+#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
+# fixed bug capturing author and contributor URL; fixed bug resolving relative
+# links in author and contributor URL; fixed bug resolvin relative links in
+# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
+# namespace tests, and included them permanently in the test suite with his
+# permission; fixed namespace handling under Python 2.1
+#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
+#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
+#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
+# use libxml2 (if available)
+#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
+# name was in parentheses; removed ultra-problematic mxTidy support; patch to
+# workaround crash in PyXML/expat when encountering invalid entities
+# (MarkMoraes); support for textinput/textInput
+#3.0b20 - 4/7/2004 - MAP - added CDF support
+#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
+#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
+# results dict; changed results dict to allow getting values with results.key
+# as well as results[key]; work around embedded illformed HTML with half
+# a DOCTYPE; work around malformed Content-Type header; if character encoding
+# is wrong, try several common ones before falling back to regexes (if this
+# works, bozo_exception is set to CharacterEncodingOverride); fixed character
+# encoding issues in BaseHTMLProcessor by tracking encoding and converting
+# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
+# convert each value in results to Unicode (if possible), even if using
+# regex-based parsing
More information about the Commits
mailing list