1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2 +++ b/trunk/quahog/plugins/Weather/README.txt Thu Oct 22 10:14:56 2009 -0400
3 @@ -0,0 +1,9 @@
4 +Plugin which allows users to query weather conditions from various
5 +websites. The weather command will try each supported weather site
6 +until it gets a valid response. One can also query a specific weather
7 +site using the appropriate command.
8 +
9 +Dependencies:
10 +- feedparser <http://www.feedparser.org/>
11 +- simplejson <http://undefined.org/python/#simplejson> (only needed on
12 + Python versions older than 2.6)
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/trunk/quahog/plugins/Weather/__init__.py Thu Oct 22 10:14:56 2009 -0400
1.3 @@ -0,0 +1,70 @@
1.4 +###
1.5 +# Copyright (c) 2005, James Vega
1.6 +# All rights reserved.
1.7 +#
1.8 +# Redistribution and use in source and binary forms, with or without
1.9 +# modification, are permitted provided that the following conditions are met:
1.10 +#
1.11 +# * Redistributions of source code must retain the above copyright notice,
1.12 +# this list of conditions, and the following disclaimer.
1.13 +# * Redistributions in binary form must reproduce the above copyright notice,
1.14 +# this list of conditions, and the following disclaimer in the
1.15 +# documentation and/or other materials provided with the distribution.
1.16 +# * Neither the name of the author of this software nor the name of
1.17 +# contributors to this software may be used to endorse or promote products
1.18 +# derived from this software without specific prior written consent.
1.19 +#
1.20 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
1.21 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1.22 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1.23 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
1.24 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
1.25 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
1.26 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
1.27 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
1.28 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
1.29 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1.30 +# POSSIBILITY OF SUCH DAMAGE.
1.31 +###
1.32 +
1.33 +"""
1.34 +This plugin does weather-related stuff. It can't change the weather, though,
1.35 +so don't get your hopes up. We just report it.
1.36 +"""
1.37 +
1.38 +import supybot
1.39 +import supybot.world as world
1.40 +
# Use this for the version of this plugin.  You may wish to put a CVS keyword
# in here if you're keeping the plugin in CVS or some similar system.
# "%%VERSION%%" is a literal placeholder string as far as this file is
# concerned; nothing here substitutes it.
__version__ = "%%VERSION%%"

# The plugin-level author is left as supybot's "unknown" author entry; the
# individual contributors are credited in __contributors__ below.
__author__ = supybot.authors.unknown

# Register an extra Author entry on supybot.authors at import time so that it
# can be used as a key in __contributors__ below.
supybot.authors.mtughan = supybot.Author('Michael Tughan', 'mtughan', 'michaelsprogramming@gmail.com')

# This is a dictionary mapping supybot.Author instances to lists of
# contributions.
__contributors__ = {
    supybot.authors.jamessan: ['cnn', 'wunder', 'wunder.rss',
                               'temperatureUnit configuration variable',
                               'convert configuration variable'],
    supybot.authors.jemfinch: ['weather'],
    supybot.authors.bwp: ['ham'],
    supybot.authors.mtughan: ['cnn', 'wunder', 'wunder.rss', 'ham'],
    }

# Python 2 implicit relative imports of this package's own modules.
import config
import plugin
reload(plugin) # In case we're being reloaded.
# Add more reloads here if you add third-party modules and want them to be
# reloaded when this plugin is reloaded.  Don't forget to import them as well!

# The test module is only imported when supybot is running its test suite.
if world.testing:
    import test

# Module-level names re-exported from the plugin and config modules.
# NOTE(review): presumably these are the entry points supybot's plugin loader
# looks up on the package -- confirm against the supybot plugin API.
Class = plugin.Class
configure = config.configure
1.71 +
1.72 +
1.73 +# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/trunk/quahog/plugins/Weather/config.py Thu Oct 22 10:14:56 2009 -0400
2.3 @@ -0,0 +1,73 @@
2.4 +###
2.5 +# Copyright (c) 2005, James Vega
2.6 +# All rights reserved.
2.7 +#
2.8 +# Redistribution and use in source and binary forms, with or without
2.9 +# modification, are permitted provided that the following conditions are met:
2.10 +#
2.11 +# * Redistributions of source code must retain the above copyright notice,
2.12 +# this list of conditions, and the following disclaimer.
2.13 +# * Redistributions in binary form must reproduce the above copyright notice,
2.14 +# this list of conditions, and the following disclaimer in the
2.15 +# documentation and/or other materials provided with the distribution.
2.16 +# * Neither the name of the author of this software nor the name of
2.17 +# contributors to this software may be used to endorse or promote products
2.18 +# derived from this software without specific prior written consent.
2.19 +#
2.20 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
2.21 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2.22 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2.23 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
2.24 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
2.25 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
2.26 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
2.27 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
2.28 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
2.29 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
2.30 +# POSSIBILITY OF SUCH DAMAGE.
2.31 +###
2.32 +
2.33 +import plugin
2.34 +
2.35 +import supybot.conf as conf
2.36 +import supybot.utils as utils
2.37 +import supybot.registry as registry
2.38 +
def configure(advanced):
    """Configure this module; called by supybot.

    advanced is a bool that specifies whether the user identified himself
    as an advanced user or not.  Configuration is effected by manipulating
    the registry; here that just means registering the Weather plugin.
    """
    # The question helpers are imported but not used by this plugin.
    from supybot.questions import (
        expect,
        anything,
        something,
        yn,
    )
    conf.registerPlugin('Weather', True)
2.46 +
class WeatherUnit(registry.String):
    """Registry string holding a temperature unit.

    The value is validated against, and translated through,
    plugin.unitAbbrevs before being stored.
    """

    def setValue(self, s):
        """Store the unit that plugin.unitAbbrevs maps s to.

        s is capitalized first (first letter upper-case, rest lower-case),
        so the lookup does not depend on the case the user typed.

        Raises registry.InvalidRegistryValue if the capitalized string is
        not a key of plugin.unitAbbrevs.
        """
        unit = s.capitalize()
        if unit in plugin.unitAbbrevs:
            registry.String.setValue(self, plugin.unitAbbrevs[unit])
            return
        raise registry.InvalidRegistryValue(
            'Unit must be one of Fahrenheit, Celsius, or Kelvin.')
2.55 +
class WeatherCommand(registry.OnlySomeStrings):
    """Registry value naming one of the plugin's weather commands.

    The accepted strings are taken from plugin.Weather.weatherCommands.
    NOTE(review): presumably registry.OnlySomeStrings rejects any value not
    listed in validStrings -- confirm against supybot.registry.
    """
    validStrings = plugin.Weather.weatherCommands
2.58 +
# Register the plugin's configuration group; the values below hang off it.
Weather = conf.registerPlugin('Weather')
# Channel value: unit used when reporting temperatures.  The default goes
# through WeatherUnit.setValue, so it must be a key of plugin.unitAbbrevs.
conf.registerChannelValue(Weather, 'temperatureUnit',
    WeatherUnit('Fahrenheit', """Sets the default temperature unit to use when
    reporting the weather."""))
# Channel value: which weather command is used by default.  The help text
# lists the valid command names taken from plugin.Weather.weatherCommands.
conf.registerChannelValue(Weather, 'command',
    WeatherCommand('wunder', """Sets the default command to use when retrieving
    the weather.  Command must be one of %s.""" %
    utils.str.commaAndify(plugin.Weather.weatherCommands, And='or')))
# Channel value: whether reported units are converted to temperatureUnit.
conf.registerChannelValue(Weather, 'convert',
    registry.Boolean(True, """Determines whether the weather commands will
    automatically convert weather units to the unit specified in
    supybot.plugins.Weather.temperatureUnit."""))

# User value: defaults to the empty string and has no help text.
# NOTE(review): the name suggests it remembers the last location a user
# queried -- confirm against plugin.py.
conf.registerUserValue(conf.users.plugins.Weather, 'lastLocation',
    registry.String('', ''))
2.74 +
2.75 +
2.76 +# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/trunk/quahog/plugins/Weather/local/BeautifulSoup.py Thu Oct 22 10:14:56 2009 -0400
3.3 @@ -0,0 +1,1080 @@
3.4 +"""Beautiful Soup
3.5 +Elixir and Tonic
3.6 +"The Screen-Scraper's Friend"
3.7 +v2.1.1
3.8 +http://www.crummy.com/software/BeautifulSoup/
3.9 +
3.10 +Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
3.11 +into a tree representation. It provides methods and Pythonic idioms
3.12 +that make it easy to search and modify the tree.
3.13 +
3.14 +A well-formed XML/HTML document will yield a well-formed data
3.15 +structure. An ill-formed XML/HTML document will yield a
3.16 +correspondingly ill-formed data structure. If your document is only
3.17 +locally well-formed, you can use this library to find and process the
3.18 +well-formed part of it. The BeautifulSoup class has heuristics for
3.19 +obtaining a sensible parse tree in the face of common HTML errors.
3.20 +
3.21 +Beautiful Soup has no external dependencies. It works with Python 2.2
3.22 +and up.
3.23 +
3.24 +Beautiful Soup defines classes for four different parsing strategies:
3.25 +
3.26 + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
3.27 + language that kind of looks like XML.
3.28 +
3.29 + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
3.30 + or invalid.
3.31 +
3.32 + * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
3.33 + that trips up BeautifulSoup.
3.34 +
3.35 + * BeautifulSOAP, for making it easier to parse XML documents that use
3.36 + lots of subelements containing a single string, where you'd prefer
3.37 + they put that string into an attribute (such as SOAP messages).
3.38 +
3.39 +You can subclass BeautifulStoneSoup or BeautifulSoup to create a
3.40 +parsing strategy specific to an XML schema or a particular bizarre
3.41 +HTML document. Typically your subclass would just override
3.42 +SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
3.43 +"""
3.44 +from __future__ import generators
3.45 +
3.46 +__author__ = "Leonard Richardson (leonardr@segfault.org)"
3.47 +__version__ = "2.1.1"
3.48 +__date__ = "$Date: 2004/10/18 00:14:20 $"
3.49 +__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
3.50 +__license__ = "PSF"
3.51 +
3.52 +from sgmllib import SGMLParser, SGMLParseError
3.53 +import types
3.54 +import re
3.55 +import sgmllib
3.56 +
# This code makes Beautiful Soup able to parse XML with namespaces: the
# replacement tag-name pattern accepts ':' inside a tag name.
# Note that this rebinds an attribute of the sgmllib module itself, so the
# change is visible to any other code in the process that uses sgmllib.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
3.59 +
class NullType(object):

    """Type of the 'Null' singleton: like None, except that it accepts
    any message (call, attribute access, item access) and answers with
    itself, so chains of lookups on a missing value never raise.

    Examples:
    >>> Null("send", "a", "message")("and one more",
    ...      "and what you get still") is Null
    True
    """

    def __new__(cls):
        # Instantiating the type hands back the existing singleton.
        return Null

    def __repr__(self):
        return "Null"

    # Every kind of read evaluates to Null.

    def __call__(self, *args, **kwargs):
        return Null

    def __getattr__(self, attr):
        return Null

    def __getitem__(self, item):
        return Null

    # Every kind of write is silently discarded.

    def __setattr__(self, attr, value):
        pass

    def __setitem__(self, item, value):
        pass

    # As a container, Null is empty (and, having length 0, it is false).

    def __len__(self):
        return 0

    def __contains__(self, item):
        return False

    def __iter__(self):
        # Must be defined explicitly: without it ``for x in Null: pass``
        # never terminates, because iteration falls back to __getitem__,
        # which always answers Null and never raises IndexError.
        return iter([])


# The one and only instance.  It is built through object.__new__ because
# NullType.__new__ itself just returns this name.
Null = object.__new__(NullType)
3.85 +
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    setup() establishes the parent, document-order and sibling links; the
    find*/fetch* methods walk those links through the *Generator methods
    and filter what they see with _fetch() and _matches().
    """

    def setup(self, parent=Null, previous=Null):
        """Sets up the initial relations between this element and
        other elements.

        parent   -- stored as self.parent (Null for none).
        previous -- stored as self.previous (Null for none).

        self.next and both sibling links start out as Null.  If the parent
        already has contents, its current last child becomes this element's
        previousSibling and is linked forward to this element."""
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document, or Null if there is none."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document, or
        Null if there is none."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document, or Null if there is none."""
        return self._first(self.fetchPrevious, name, attrs, text)
    firstPrevious = findPrevious

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document, or
        Null if there is none."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Returns the closest parent of this Tag that matches the given
        criteria, or Null if there is none."""
        r = Null
        l = self.fetchParents(name, attrs, 1)
        if l:
            r = l[0]
        return r
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    #These methods do the real heavy lifting.

    def _first(self, method, name, attrs, text):
        """Calls the given fetch method with a limit of 1 and returns the
        single result, or Null if nothing matched."""
        r = Null
        l = method(name, attrs, text, 1)
        if l:
            r = l[0]
        return r

    def _fetch(self, name, attrs, text, limit, generator):
        """Iterates over a generator looking for things that match.

        name      -- criterion for the tag name (anything _matches accepts);
                     a false value matches every tag.
        attrs     -- map of attribute name to criterion; anything without an
                     'items' attribute is shorthand for {'class': attrs}.
        text      -- if true, only non-Tag items are considered and they are
                     matched against this criterion instead.
        limit     -- stop after this many results (a false value: no limit).
        generator -- zero-argument callable returning the items to examine.

        Returns the list of matching items, in generator order."""
        if not hasattr(attrs, 'items'):
            attrs = {'class' : attrs}

        results = []
        for i in generator():
            found = None
            if isinstance(i, Tag):
                if not text:
                    if not name or self._matches(i, name):
                        match = True
                        for attr, matchAgainst in attrs.items():
                            # check is None when the tag lacks the attribute
                            check = i.get(attr)
                            if not self._matches(check, matchAgainst):
                                match = False
                                break
                        if match:
                            found = i
            elif text:
                if self._matches(i, text):
                    found = i
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
        return results

    #Generators that can be used to navigate starting from both
    #NavigableTexts and Tags.  Each one follows a single link until it
    #reaches a false value; that terminating value is itself the last
    #thing yielded.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    def _matches(self, chunk, howToMatch):
        """Returns a true value if chunk satisfies the criterion howToMatch.

        chunk may be a Tag (matched by name, except that a callable
        criterion receives the Tag itself), a string, None (a missing
        value), or a list of items (true if any NavigableText in the list
        matches).

        howToMatch may be a callable taking the chunk, a regexp object
        (anything with a 'match' attribute; its search() is used), a list
        or a map (membership test), or anything else, which is compared
        through str()."""
        #print 'looking for %s in %s' % (howToMatch, chunk)
        #
        # If given a list of items, return true if the list contains a
        # text element that matches.
        if isList(chunk) and not isinstance(chunk, Tag):
            for tag in chunk:
                if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        if chunk is None:
            # A missing value (e.g. Tag.get() on an absent attribute) must
            # not be stringified: that would turn it into the text 'None',
            # which the string 'None' or a regexp such as 'on' then
            # matches.  Only an explicit criterion of None selects it.
            return howToMatch is None
        if isinstance(chunk, Tag):
            #Custom match methods take the tag as an argument, but all other
            #ways of matching match the tag name as a string
            chunk = chunk.name
        #Now we know that chunk is a string
        if not isinstance(chunk, basestring):
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # It's a regexp object.
            return howToMatch.search(chunk)
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return howToMatch.has_key(chunk)
        #It's just a string
        return str(howToMatch) == chunk
3.265 +
class NavigableText(PageElement):
    """PageElement behaviour shared by the text-node classes."""

    def __getattr__(self, attr):
        """For backwards compatibility, text.string gives you text.

        Any other attribute that normal lookup did not find raises
        AttributeError."""
        if attr != 'string':
            message = "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
            raise AttributeError(message)
        return self
3.274 +
class NavigableString(str, NavigableText):
    """A str that also carries the NavigableText/PageElement navigation
    methods, so a piece of text can sit in the parse tree."""
    pass
3.277 +
class NavigableUnicodeString(unicode, NavigableText):
    """A unicode string that also carries the NavigableText/PageElement
    navigation methods, so a piece of text can sit in the parse tree."""
    pass
3.280 +
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    Instance attributes set by the constructor:
    name     -- the tag name.
    attrs    -- list of (key, value) attribute pairs.
    contents -- list of child elements.
    hidden   -- when true, __str__ renders only the contents and not the
                tag itself.

    A map view of attrs is built lazily by _getAttrMap() and cached as
    self.attrMap."""

    def __init__(self, name, attrs=None, parent=Null, previous=Null):
        """Basic constructor.

        name     -- the tag name.
        attrs    -- list of (key, value) pairs; a fresh empty list when
                    omitted.
        parent, previous -- handed to PageElement.setup()."""
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "x in tag is true if x is one of the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag.

        Every existing (key, ...) pair in self.attrs is overwritten; if
        there is none, a new pair is appended."""
        self._getAttrMap()[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so every
        # occurrence has to go.  Filter into a new list (assigned in place)
        # rather than calling remove() while iterating, which skips the
        # element that follows each removed one.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        fetch() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.fetch(*args, **kwargs)

    def __getattr__(self, tag):
        """Attribute access that normal lookup could not satisfy is
        treated as a search: tag.foo and tag.fooTag both return the first
        'foo' tag found within this tag (Null if there is none).

        Names starting with '__' are not searched for and raise
        AttributeError, so that protocol probes for special methods see an
        ordinary missing attribute instead of None."""
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.first(tag[:-3])
        if not tag.startswith('__'):
            return self.first(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if (not hasattr(other, 'name')
            or not hasattr(other, 'attrs')
            or not hasattr(other, 'contents')
            or self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self):
        """Renders this tag as a string."""
        return str(self)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(1)

    def __str__(self, needUnicode=None, showStructureIndent=None):
        """Returns a string or Unicode representation of this tag and
        its contents.

        needUnicode         -- true: force a unicode result; False: force
                               a str result; None: leave the type as is.
        showStructureIndent -- None for plain output; otherwise the
                               indentation depth to use for this tag when
                               pretty-printing.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                attrs.append('%s="%s"' % (key, val))
        close = ''
        closeTag = ''
        if self.isSelfClosing():
            close = ' /'
        else:
            closeTag = '</%s>' % self.name
        indentIncrement = None
        space = ''
        if showStructureIndent is not None:
            indentIncrement = showStructureIndent
            if not self.hidden:
                indentIncrement += 1
            # Computed for every non-None depth, including 0: the closing
            # tag below uses it whenever pretty-printing is on at all.
            space = '\n%s' % (' ' * showStructureIndent)
        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if showStructureIndent:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            s.append(contents)
            if closeTag and showStructureIndent is not None:
                s.append(space)
            s.append(closeTag)
            s = ''.join(s)
        isUnicode = type(s) == types.UnicodeType
        if needUnicode and not isUnicode:
            s = unicode(s)
        elif isUnicode and needUnicode==False:
            s = str(s)
        return s

    def prettify(self, needUnicode=None):
        "Renders this tag with structural indentation turned on."
        return self.__str__(needUnicode, showStructureIndent=True)

    def renderContents(self, showStructureIndent=None, needUnicode=None):
        """Renders the contents of this tag as a (possibly Unicode)
        string.

        When showStructureIndent is not None, one trailing newline is
        stripped from each piece of text."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
                text = unicode(c)
            elif isinstance(c, Tag):
                s.append(c.__str__(needUnicode, showStructureIndent))
            elif needUnicode:
                text = unicode(c)
            else:
                text = str(c)
            if text:
                if showStructureIndent is not None:
                    if text[-1] == '\n':
                        text = text[:-1]
                s.append(text)
        return ''.join(s)

    #Soup methods

    def firstText(self, text, recursive=True):
        """Convenience method to retrieve the first piece of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.first(recursive=recursive, text=text)

    def fetchText(self, text, recursive=True, limit=None):
        """Convenience method to retrieve all pieces of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.fetch(recursive=recursive, text=text, limit=limit)

    def first(self, name=None, attrs={}, recursive=True, text=None):
        """Return only the first child of this
        Tag matching the given criteria, or Null if nothing matches."""
        r = Null
        l = self.fetch(name, attrs, recursive, text, 1)
        if l:
            r = l[0]
        return r
    findChild = first

    def fetch(self, name=None, attrs={}, recursive=True, text=None,
              limit=None):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive false, only direct children are examined."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._fetch(name, attrs, text, limit, generator)
    fetchChildren = fetch

    #Utility methods

    def isSelfClosing(self):
        """Returns true iff this is a self-closing tag as defined in the HTML
        standard.

        TODO: This is specific to BeautifulSoup and its subclasses, but it's
        used by __str__"""
        return self.name in BeautifulSoup.SELF_CLOSING_TAGS

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized, and returns it.

        The map is (re)built from self.attrs whenever no map is cached or
        the cached one is empty."""
        # Look in the instance dictionary directly.  getattr() would fall
        # through to __getattr__, i.e. search the subtree for a child tag
        # named 'attrMap' on every miss, and return that child if one
        # happened to exist.
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields every descendant of this tag, each tag being followed
        by its own descendants before its next sibling.

        Iterative rather than recursive: the stack holds (tag, index)
        pairs recording where to resume in a tag's contents."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Remember where to resume in the current tag (if
                        # it has children left), then descend into 'a'.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
3.543 +
3.544 +
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything exposing __iter__ counts as listlike, as does any list or
    tuple (subclasses included).  Returns a bool."""
    return hasattr(l, '__iter__') or isinstance(l, (list, tuple))
3.550 +
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
    of lists and partial maps.

    Maps are merged in as they are; every item of a list, and every
    scalar, becomes a key mapped to 'default'.  Arguments are processed
    left to right, so a later argument overrides an earlier one."""
    tagMap = {}
    for piece in args:
        if hasattr(piece, 'items'):
            #A map: copy its pairs across.
            for key, value in piece.items():
                tagMap[key] = value
            continue
        if isList(piece):
            #A list: each item is a key.
            keys = piece
        else:
            #A scalar: it is the only key.
            keys = [piece]
        for key in keys:
            tagMap[key] = default
    return tagMap
3.569 +
3.570 +class BeautifulStoneSoup(Tag, SGMLParser):
3.571 +
3.572 + """This class contains the basic parser and fetch code. It defines
3.573 + a parser that knows nothing about tag behavior except for the
3.574 + following:
3.575 +
3.576 + You can't close a tag without closing all the tags it encloses.
3.577 + That is, "<foo><bar></foo>" actually means
3.578 + "<foo><bar></bar></foo>".
3.579 +
3.580 + [Another possible explanation is "<foo><bar /></foo>", but since
3.581 + this class defines no SELF_CLOSING_TAGS, it will never use that
3.582 + explanation.]
3.583 +
3.584 + This class is useful for parsing XML or made-up markup languages,
3.585 + or when BeautifulSoup makes an assumption counter to what you were
3.586 + expecting."""
3.587 +
3.588 + SELF_CLOSING_TAGS = {}
3.589 + NESTABLE_TAGS = {}
3.590 + RESET_NESTING_TAGS = {}
3.591 + QUOTE_TAGS = {}
3.592 +
3.593 + #As a public service we will by default silently replace MS smart quotes
3.594 + #and similar characters with their HTML or ASCII equivalents.
3.595 + MS_CHARS = { '\x80' : '€',
3.596 + '\x81' : ' ',
3.597 + '\x82' : '‚',
3.598 + '\x83' : 'ƒ',
3.599 + '\x84' : '„',
3.600 + '\x85' : '…',
3.601 + '\x86' : '†',
3.602 + '\x87' : '‡',
3.603 + '\x88' : '⁁',
3.604 + '\x89' : '%',
3.605 + '\x8A' : 'Š',
3.606 + '\x8B' : '<',
3.607 + '\x8C' : 'Œ',
3.608 + '\x8D' : '?',
3.609 + '\x8E' : 'Z',
3.610 + '\x8F' : '?',
3.611 + '\x90' : '?',
3.612 + '\x91' : '‘',
3.613 + '\x92' : '’',
3.614 + '\x93' : '“',
3.615 + '\x94' : '”',
3.616 + '\x95' : '•',
3.617 + '\x96' : '–',
3.618 + '\x97' : '—',
3.619 + '\x98' : '˜',
3.620 + '\x99' : '™',
3.621 + '\x9a' : 'š',
3.622 + '\x9b' : '>',
3.623 + '\x9c' : 'œ',
3.624 + '\x9d' : '?',
3.625 + '\x9e' : 'z',
3.626 + '\x9f' : 'Ÿ',}
3.627 +
3.628 + PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
3.629 + lambda(x):x.group(1) + ' />'),
3.630 + (re.compile('<!\s+([^<>]*)>'),
3.631 + lambda(x):'<!' + x.group(1) + '>'),
3.632 + (re.compile("([\x80-\x9f])"),
3.633 + lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
3.634 + ]
3.635 +
3.636 + ROOT_TAG_NAME = '[document]'
3.637 +
    def __init__(self, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        """Initialize this as the 'root tag' and feed in any text to
        the parser.

        text -- markup to parse: either a string or an object with a
        read() method, whose read() result is parsed. Nothing is fed
        if it is empty or None.

        avoidParserProblems -- a true value that is not a list selects
        the default PARSER_MASSAGE; a list is used as-is as the massage
        list; a false value disables massaging in feed().

        initialTextIsEverything -- when true, done() is called after
        the initial feed so that unclosed tags are popped.

        NOTE about avoidParserProblems: sgmllib will process most bad
        HTML, and BeautifulSoup has tricks for dealing with some HTML
        that kills sgmllib, but Beautiful Soup can nonetheless choke
        or lose data if your data uses self-closing tags or
        declarations incorrectly. By default, Beautiful Soup sanitizes
        its input to avoid the vast majority of these problems. The
        problems are relatively rare, even in bad HTML, so feel free
        to pass in False to avoidParserProblems if they don't apply to
        you, and you'll get better performance. The only reason I have
        this turned on by default is so I don't get so many tech
        support questions.

        The two most common instances of invalid HTML that will choke
        sgmllib are fixed by the default parser massage techniques:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""
        # The parser object doubles as the root Tag of the parse tree.
        Tag.__init__(self, self.ROOT_TAG_NAME)
        if avoidParserProblems \
           and not isList(avoidParserProblems):
            avoidParserProblems = self.PARSER_MASSAGE
        self.avoidParserProblems = avoidParserProblems
        SGMLParser.__init__(self)
        # Names of the quote tags currently open; see unknown_starttag().
        self.quoteStack = []
        self.hidden = 1
        self.reset()
        if hasattr(text, 'read'):
            #It's a file-type object.
            text = text.read()
        if text:
            self.feed(text)
        if initialTextIsEverything:
            self.done()
3.680 +
3.681 + def __getattr__(self, methodName):
3.682 + """This method routes method call requests to either the SGMLParser
3.683 + superclass or the Tag superclass, depending on the method name."""
3.684 + if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
3.685 + or methodName.find('do_') == 0:
3.686 + return SGMLParser.__getattr__(self, methodName)
3.687 + elif methodName.find('__') != 0:
3.688 + return Tag.__getattr__(self, methodName)
3.689 + else:
3.690 + raise AttributeError
3.691 +
3.692 + def feed(self, text):
3.693 + if self.avoidParserProblems:
3.694 + for fix, m in self.avoidParserProblems:
3.695 + text = fix.sub(m, text)
3.696 + SGMLParser.feed(self, text)
3.697 +
3.698 + def done(self):
3.699 + """Called when you're done parsing, so that the unclosed tags can be
3.700 + correctly processed."""
3.701 + self.endData() #NEW
3.702 + while self.currentTag.name != self.ROOT_TAG_NAME:
3.703 + self.popTag()
3.704 +
3.705 + def reset(self):
3.706 + SGMLParser.reset(self)
3.707 + self.currentData = []
3.708 + self.currentTag = None
3.709 + self.tagStack = []
3.710 + self.pushTag(self)
3.711 +
3.712 + def popTag(self):
3.713 + tag = self.tagStack.pop()
3.714 + # Tags with just one string-owning child get the child as a
3.715 + # 'string' property, so that soup.tag.string is shorthand for
3.716 + # soup.tag.contents[0]
3.717 + if len(self.currentTag.contents) == 1 and \
3.718 + isinstance(self.currentTag.contents[0], NavigableText):
3.719 + self.currentTag.string = self.currentTag.contents[0]
3.720 +
3.721 + #print "Pop", tag.name
3.722 + if self.tagStack:
3.723 + self.currentTag = self.tagStack[-1]
3.724 + return self.currentTag
3.725 +
3.726 + def pushTag(self, tag):
3.727 + #print "Push", tag.name
3.728 + if self.currentTag:
3.729 + self.currentTag.append(tag)
3.730 + self.tagStack.append(tag)
3.731 + self.currentTag = self.tagStack[-1]
3.732 +
3.733 + def endData(self):
3.734 + currentData = ''.join(self.currentData)
3.735 + if currentData:
3.736 + if not currentData.strip():
3.737 + if '\n' in currentData:
3.738 + currentData = '\n'
3.739 + else:
3.740 + currentData = ' '
3.741 + c = NavigableString
3.742 + if type(currentData) == types.UnicodeType:
3.743 + c = NavigableUnicodeString
3.744 + o = c(currentData)
3.745 + o.setup(self.currentTag, self.previous)
3.746 + if self.previous:
3.747 + self.previous.next = o
3.748 + self.previous = o
3.749 + self.currentTag.contents.append(o)
3.750 + self.currentData = []
3.751 +
3.752 + def _popToTag(self, name, inclusivePop=True):
3.753 + """Pops the tag stack up to and including the most recent
3.754 + instance of the given tag. If inclusivePop is false, pops the tag
3.755 + stack up to but *not* including the most recent instqance of
3.756 + the given tag."""
3.757 + if name == self.ROOT_TAG_NAME:
3.758 + return
3.759 +
3.760 + numPops = 0
3.761 + mostRecentTag = None
3.762 + for i in range(len(self.tagStack)-1, 0, -1):
3.763 + if name == self.tagStack[i].name:
3.764 + numPops = len(self.tagStack)-i
3.765 + break
3.766 + if not inclusivePop:
3.767 + numPops = numPops - 1
3.768 +
3.769 + for i in range(0, numPops):
3.770 + mostRecentTag = self.popTag()
3.771 + return mostRecentTag
3.772 +
    def _smartPop(self, name):

        """Decide how far to pop the tag stack before opening a new
        tag called name, then pop via _popToTag.

        We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        # None means name has no entry in NESTABLE_TAGS; otherwise this
        # is the collection of tag names that reset nesting for name.
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the stack from the innermost open tag outwards; index 0
        # (the root) is never examined.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.

                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment has no effect on the loop;
            # p is rebound from tagStack at the top of every iteration.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)
3.820 +
3.821 + def unknown_starttag(self, name, attrs, selfClosing=0):
3.822 + #print "Start tag %s" % name
3.823 + if self.quoteStack:
3.824 + #This is not a real tag.
3.825 + #print "<%s> is not real!" % name
3.826 + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
3.827 + self.handle_data('<%s%s>' % (name, attrs))
3.828 + return
3.829 + self.endData()
3.830 + if not name in self.SELF_CLOSING_TAGS and not selfClosing:
3.831 + self._smartPop(name)
3.832 + tag = Tag(name, attrs, self.currentTag, self.previous)
3.833 + if self.previous:
3.834 + self.previous.next = tag
3.835 + self.previous = tag
3.836 + self.pushTag(tag)
3.837 + if selfClosing or name in self.SELF_CLOSING_TAGS:
3.838 + self.popTag()
3.839 + if name in self.QUOTE_TAGS:
3.840 + #print "Beginning quote (%s)" % name
3.841 + self.quoteStack.append(name)
3.842 + self.literal = 1
3.843 +
3.844 + def unknown_endtag(self, name):
3.845 + if self.quoteStack and self.quoteStack[-1] != name:
3.846 + #This is not a real end tag.
3.847 + #print "</%s> is not real!" % name
3.848 + self.handle_data('</%s>' % name)
3.849 + return
3.850 + self.endData()
3.851 + self._popToTag(name)
3.852 + if self.quoteStack and self.quoteStack[-1] == name:
3.853 + self.quoteStack.pop()
3.854 + self.literal = (len(self.quoteStack) > 0)
3.855 +
3.856 + def handle_data(self, data):
3.857 + self.currentData.append(data)
3.858 +
3.859 + def handle_pi(self, text):
3.860 + "Propagate processing instructions right through."
3.861 + self.handle_data("<?%s>" % text)
3.862 +
3.863 + def handle_comment(self, text):
3.864 + "Propagate comments right through."
3.865 + self.handle_data("<!--%s-->" % text)
3.866 +
3.867 + def handle_charref(self, ref):
3.868 + "Propagate char refs right through."
3.869 + self.handle_data('&#%s;' % ref)
3.870 +
3.871 + def handle_entityref(self, ref):
3.872 + "Propagate entity refs right through."
3.873 + self.handle_data('&%s;' % ref)
3.874 +
3.875 + def handle_decl(self, data):
3.876 + "Propagate DOCTYPEs and the like right through."
3.877 + self.handle_data('<!%s>' % data)
3.878 +
3.879 + def parse_declaration(self, i):
3.880 + """Treat a bogus SGML declaration as raw data. Treat a CDATA
3.881 + declaration as regular data."""
3.882 + j = None
3.883 + if self.rawdata[i:i+9] == '<![CDATA[':
3.884 + k = self.rawdata.find(']]>', i)
3.885 + if k == -1:
3.886 + k = len(self.rawdata)
3.887 + self.handle_data(self.rawdata[i+9:k])
3.888 + j = k+3
3.889 + else:
3.890 + try:
3.891 + j = SGMLParser.parse_declaration(self, i)
3.892 + except SGMLParseError:
3.893 + toHandle = self.rawdata[i:]
3.894 + self.handle_data(toHandle)
3.895 + j = i + len(toHandle)
3.896 + return j
3.897 +
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (i.e. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup before writing your own
    subclass."""

    #Tags that unknown_starttag() pops again as soon as it has pushed
    #them.
    SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta',
                                           'spacer', 'link', 'frame', 'base'])

    #Tags whose content is buffered as text rather than parsed as tags
    #(see the quoteStack handling in unknown_starttag/unknown_endtag).
    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    #Each value lists the tags that reset nesting for the key (these
    #are the 'nesting reset triggers' consulted by _smartPop).
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    #Looked up by _smartPop: a tag with an entry here counts as
    #nestable, and the entry's value is its nesting reset triggers.
    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
3.988 +
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close (e.g.) a 'b'
    tag than to actually use nested 'b' tags, and the BeautifulSoup
    class handles the common case. This class handles the
    not-so-common case: where you can't believe someone wrote what
    they did, but it's valid HTML and BeautifulSoup screwed up by
    assuming it wouldn't be.

    If this doesn't do what you need, try subclassing this class or
    BeautifulSoup, and providing your own list of NESTABLE_TAGS."""

    # NOTE(review): 'strong' and 'big' each appear twice below;
    # presumably harmless if buildTagMap builds a mapping keyed by tag
    # name -- confirm against buildTagMap.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Everything BeautifulSoup treats as nestable, plus the tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
3.1027 +
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

     <foo><bar>baz</bar></foo>
      =>
     <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copy a single-string tag onto its parent as an attribute,
        then pop it.

        The copy only happens when the tag on top of the stack is a
        Tag whose contents are exactly one NavigableText and the parent
        has no attribute of that name yet.

        Returns the result of BeautifulStoneSoup.popTag, so callers
        such as _popToTag get the same value as with the base class."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Called before attrMap is read below.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableText) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)
3.1058 +
3.1059 +#Enterprise class names! It has come to our attention that some people
3.1060 +#think the names of the Beautiful Soup parser classes are too silly
3.1061 +#and "unprofessional" for use in enterprise screen-scraping. We feel
3.1062 +#your pain! For such-minded folk, the Beautiful Soup Consortium And
3.1063 +#All-Night Kosher Bakery recommends renaming this file to
3.1064 +#"RobustParser.py" (or, in cases of extreme enterprisitude,
3.1065 +#"RobustParserBeanInterface.class") and using the following
3.1066 +#enterprise-friendly class aliases:
# Alias of BeautifulStoneSoup; adds nothing.
class RobustXMLParser(BeautifulStoneSoup):
    pass
# Alias of BeautifulSoup; adds nothing.
class RobustHTMLParser(BeautifulSoup):
    pass
# Alias of ICantBelieveItsBeautifulSoup; adds nothing.
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
# Alias of BeautifulSOAP; adds nothing.
class SimplifyingSOAPParser(BeautifulSOAP):
    pass
3.1075 +
3.1076 +###
3.1077 +
3.1078 +
3.1079 +#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    # Script mode: parse standard input with BeautifulStoneSoup and
    # write the prettified tree to standard output.
    import sys
    markup = sys.stdin.read()
    print(BeautifulStoneSoup(markup).prettify())
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/trunk/quahog/plugins/Weather/local/__init__.py Thu Oct 22 10:14:56 2009 -0400
4.3 @@ -0,0 +1,1 @@
4.4 +# Stub so local is a module, used for third-party modules
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/trunk/quahog/plugins/Weather/local/feedparser.py Thu Oct 22 10:14:56 2009 -0400
5.3 @@ -0,0 +1,2858 @@
5.4 +#!/usr/bin/env python
5.5 +"""Universal feed parser
5.6 +
5.7 +Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5.8 +
5.9 +Visit http://feedparser.org/ for the latest version
5.10 +Visit http://feedparser.org/docs/ for the latest documentation
5.11 +
5.12 +Required: Python 2.1 or later
5.13 +Recommended: Python 2.3 or later
5.14 +Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
5.15 +"""
5.16 +
5.17 +__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
5.18 +__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
5.19 +
5.20 +Redistribution and use in source and binary forms, with or without modification,
5.21 +are permitted provided that the following conditions are met:
5.22 +
5.23 +* Redistributions of source code must retain the above copyright notice,
5.24 + this list of conditions and the following disclaimer.
5.25 +* Redistributions in binary form must reproduce the above copyright notice,
5.26 + this list of conditions and the following disclaimer in the documentation
5.27 + and/or other materials provided with the distribution.
5.28 +
5.29 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
5.30 +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
5.31 +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
5.32 +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
5.33 +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
5.34 +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
5.35 +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
5.36 +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
5.37 +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
5.38 +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
5.39 +POSSIBILITY OF SUCH DAMAGE."""
5.40 +__author__ = "Mark Pilgrim <http://diveintomark.org/>"
5.41 +__contributors__ = ["Jason Diamond <http://injektilo.org/>",
5.42 + "John Beimler <http://john.beimler.org/>",
5.43 + "Fazal Majid <http://www.majid.info/mylos/weblog/>",
5.44 + "Aaron Swartz <http://aaronsw.com/>",
5.45 + "Kevin Marks <http://epeus.blogspot.com/>"]
# Debug switch. When true, the parser writes trace lines to sys.stderr
# and chardet's own debug flag is turned on at import time.
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
5.70 +
5.71 +# ---------- required modules (should come with any Python distribution) ----------
5.72 +import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
5.73 +try:
5.74 + from cStringIO import StringIO as _StringIO
5.75 +except:
5.76 + from StringIO import StringIO as _StringIO
5.77 +
5.78 +# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
5.79 +
5.80 +# gzip is included with most Python distributions, but may not be available if you compiled your own
5.81 +try:
5.82 + import gzip
5.83 +except:
5.84 + gzip = None
5.85 +try:
5.86 + import zlib
5.87 +except:
5.88 + zlib = None
5.89 +
5.90 +# If a real XML parser is available, feedparser will attempt to use it. feedparser has
5.91 +# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
5.92 +# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
5.93 +# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    # Deliberately broad: any failure while probing for a SAX parser
    # selects the fallback below.
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        """Escape '&', '>' and '<' in data as XML entities.

        Stand-in for xml.sax.saxutils.escape when no XML parser is
        usable."""
        # '&' must be replaced first so the ampersands introduced by
        # the other two replacements are not escaped again.
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data
5.106 +
5.107 +# base64 support for Atom feeds that contain embedded binary data
5.108 +try:
5.109 + import base64, binascii
5.110 +except:
5.111 + base64 = binascii = None
5.112 +
5.113 +# cjkcodecs and iconv_codec provide support for more character encodings.
5.114 +# Both are available from http://cjkpython.i18n.org/
5.115 +try:
5.116 + import cjkcodecs.aliases
5.117 +except:
5.118 + pass
5.119 +try:
5.120 + import iconv_codec
5.121 +except:
5.122 + pass
5.123 +
5.124 +# chardet library auto-detects character encodings
5.125 +# Download from http://chardet.feedparser.org/
5.126 +try:
5.127 + import chardet
5.128 + if _debug:
5.129 + import chardet.constants
5.130 + chardet.constants._debug = 1
5.131 +except:
5.132 + chardet = None
5.133 +
5.134 +# ---------- don't touch these ----------
# Base class shared by the three exception types that follow it.
class ThingsNobodyCaresAboutButMe(Exception):
    pass


class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    pass


class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    pass


class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    pass


# Derives directly from Exception, not from the base class above.
class UndeclaredNamespace(Exception):
    pass
5.140 +
# Replace three of sgmllib's module-level regular expressions. This
# rebinds attributes on the sgmllib module object itself, not on a copy.
# tagfind: a letter followed by letters, digits, '-', '_', '.' or ':'.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
# special: the literal "<!".
sgmllib.special = re.compile('<!')
# charref: "&#", an optional 'x' and hex digits (captured), then one
# non-hex-digit character.
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
5.144 +
# Maps a short feed type/version identifier to its human-readable name.
# The empty string is the identifier for an unknown version.
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
5.163 +
# Use the builtin dict type as the base for FeedParserDict when it
# exists; otherwise fall back to UserDict and supply a minimal dict().
try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict

    def dict(aList):
        """Build a plain dictionary from a sequence of (key, value) pairs."""
        result = {}
        for key, value in aList:
            result[key] = value
        return result
5.174 +
class FeedParserDict(UserDict):
    """Dictionary whose values can also be reached through alternate key
    names (see keymap) and through attribute access."""

    # Alternate key name -> key under which the value is actually stored.
    # A list value gives several candidate stored keys: __getitem__ tries
    # them in order, __setitem__ stores under the first.
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        """Return the value for key, translating alternate names.

        'category' yields the 'term' of the first stored tag and
        'categories' a list of (scheme, term) tuples built from the
        stored 'tags'. Other keys are translated through keymap."""
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            # Several candidates: return the first one that is stored.
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        # A value stored under the literal key wins over the translation.
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        """Store value under key after translating it through keymap; for
        a list-valued mapping the first element is used."""
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        """Return self[key] if has_key(key) is true, otherwise default."""
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        """Store value under key unless has_key(key) is already true, then
        return self[key]."""
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        """Return true if key resolves as an attribute of this object or
        is a stored key."""
        try:
            # hasattr falls through to __getattr__, and so to
            # __getitem__, which lets alternate key names count.
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        """Look key up in the instance __dict__, then as an item.

        Names starting with '_' are never looked up as items. Any
        failure of the item lookup is reported as AttributeError."""
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        """Store names starting with '_', and the name 'data', in the
        instance __dict__; store every other name as an item."""
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        """Support 'key in d' with the same rules as has_key."""
        return self.has_key(key)
5.249 +
def zopeCompatibilityHack():
    """Replace the module-level FeedParserDict with a factory function
    that returns plain dictionaries."""
    global FeedParserDict
    del FeedParserDict

    def FeedParserDict(aDict=None):
        # Returns a new plain dict, filled from aDict when it is non-empty.
        plain = {}
        if aDict:
            plain.update(aDict)
        return plain
5.258 +
# Translation table for _ebcdic_to_ascii; built on first use.
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Return s with every character replaced according to the EBCDIC
    table below.

    The 256-character table is built on the first call and cached in
    the module-level _ebcdic_to_ascii_map."""
    global _ebcdic_to_ascii_map
    if _ebcdic_to_ascii_map is None:
        # emap[i] is the output character code for input character code i.
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        # The table is simply the mapped characters in index order.
        # Passing it through string.maketrans with an identity "from"
        # string would give back this same string.
        _ebcdic_to_ascii_map = ''.join(map(chr, emap))
    return s.translate(_ebcdic_to_ascii_map)
5.285 +
# Matches "<scheme>://" plus any extra slashes right after it. '-' sits
# last in the character class so it is a literal; written as '+-.' it
# formed a range that also let ',' through as a scheme character.
_urifixer = re.compile(r'^([A-Za-z][A-Za-z0-9+.-]*://)/*')
def _urljoin(base, uri):
    """Join uri onto base with urlparse.urljoin, first collapsing any
    surplus slashes that follow '<scheme>://' in uri."""
    uri = _urifixer.sub(r'\1', uri)
    return urlparse.urljoin(base, uri)
5.290 +
5.291 +class _FeedParserMixin:
5.292 + namespaces = {'': '',
5.293 + 'http://backend.userland.com/rss': '',
5.294 + 'http://blogs.law.harvard.edu/tech/rss': '',
5.295 + 'http://purl.org/rss/1.0/': '',
5.296 + 'http://my.netscape.com/rdf/simple/0.9/': '',
5.297 + 'http://example.com/newformat#': '',
5.298 + 'http://example.com/necho': '',
5.299 + 'http://purl.org/echo/': '',
5.300 + 'uri/of/echo/namespace#': '',
5.301 + 'http://purl.org/pie/': '',
5.302 + 'http://purl.org/atom/ns#': '',
5.303 + 'http://www.w3.org/2005/Atom': '',
5.304 + 'http://purl.org/rss/1.0/modules/rss091#': '',
5.305 +
5.306 + 'http://webns.net/mvcb/': 'admin',
5.307 + 'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
5.308 + 'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
5.309 + 'http://media.tangent.org/rss/1.0/': 'audio',
5.310 + 'http://backend.userland.com/blogChannelModule': 'blogChannel',
5.311 + 'http://web.resource.org/cc/': 'cc',
5.312 + 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
5.313 + 'http://purl.org/rss/1.0/modules/company': 'co',
5.314 + 'http://purl.org/rss/1.0/modules/content/': 'content',
5.315 + 'http://my.theinfo.org/changed/1.0/rss/': 'cp',
5.316 + 'http://purl.org/dc/elements/1.1/': 'dc',
5.317 + 'http://purl.org/dc/terms/': 'dcterms',
5.318 + 'http://purl.org/rss/1.0/modules/email/': 'email',
5.319 + 'http://purl.org/rss/1.0/modules/event/': 'ev',
5.320 + 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
5.321 + 'http://freshmeat.net/rss/fm/': 'fm',
5.322 + 'http://xmlns.com/foaf/0.1/': 'foaf',
5.323 + 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
5.324 + 'http://postneo.com/icbm/': 'icbm',
5.325 + 'http://purl.org/rss/1.0/modules/image/': 'image',
5.326 + 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
5.327 + 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
5.328 + 'http://purl.org/rss/1.0/modules/link/': 'l',
5.329 + 'http://search.yahoo.com/mrss': 'media',
5.330 + 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
5.331 + 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
5.332 + 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
5.333 + 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
5.334 + 'http://purl.org/rss/1.0/modules/reference/': 'ref',
5.335 + 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
5.336 + 'http://purl.org/rss/1.0/modules/search/': 'search',
5.337 + 'http://purl.org/rss/1.0/modules/slash/': 'slash',
5.338 + 'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
5.339 + 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
5.340 + 'http://hacks.benhammersley.com/rss/streaming/': 'str',
5.341 + 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
5.342 + 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
5.343 + 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
5.344 + 'http://purl.org/rss/1.0/modules/threading/': 'thr',
5.345 + 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
5.346 + 'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
5.347 + 'http://wellformedweb.org/commentAPI/': 'wfw',
5.348 + 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
5.349 + 'http://www.w3.org/1999/xhtml': 'xhtml',
5.350 + 'http://www.w3.org/XML/1998/namespace': 'xml',
5.351 + 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
5.352 +}
# Lower-cased namespace URI -> preferred prefix.  Starts empty and is filled
# from ``namespaces`` the first time __init__ runs.
_matchnamespaces = {}

# Elements whose whole value is a URI; pop() resolves them against the base URI.
can_be_relative_uri = [
    'link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href',
    'comments', 'license', 'icon', 'logo',
]
# Elements whose (HTML) value may embed relative URIs.
can_contain_relative_uris = [
    'content', 'title', 'summary', 'info', 'tagline', 'subtitle',
    'copyright', 'rights', 'description',
]
# Elements whose (HTML) value is passed through the sanitizer by pop().
can_contain_dangerous_markup = [
    'content', 'title', 'summary', 'info', 'tagline', 'subtitle',
    'copyright', 'rights', 'description',
]
# Content types that pop() treats as HTML.
html_types = ['text/html', 'application/xhtml+xml']
5.359 +
def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
    """Initialise the parsing state.

    baseuri  -- starting base URI ('' when omitted)
    baselang -- starting language; when given it is also stored as the
                feed-level 'language'
    encoding -- character encoding, kept on ``self.encoding``
    """
    if _debug:
        sys.stderr.write('initializing FeedParser\n')

    # The lower-cased lookup table is built only while it is still empty.
    if not self._matchnamespaces:
        for uri, prefix in self.namespaces.items():
            self._matchnamespaces[uri.lower()] = prefix

    # results
    self.feeddata = FeedParserDict()   # feed-level data
    self.encoding = encoding           # character encoding
    self.entries = []                  # list of entry-level data
    self.version = ''                  # feed type/version, see SUPPORTED_VERSIONS
    self.namespacesInUse = {}          # dictionary of namespaces defined by the feed

    # the following are used internally to track state;
    # this is really out of control and should be refactored
    self.infeed = 0
    self.inentry = 0
    self.incontent = 0
    self.intextinput = 0
    self.inimage = 0
    self.inauthor = 0
    self.incontributor = 0
    self.inpublisher = 0
    self.insource = 0
    self.sourcedata = FeedParserDict()
    self.contentparams = FeedParserDict()
    self._summaryKey = None
    self.namespacemap = {}
    self.elementstack = []
    self.basestack = []
    self.langstack = []
    self.baseuri = baseuri or ''
    self.lang = baselang or None
    if baselang:
        self.feeddata['language'] = baselang
5.393 +
def unknown_starttag(self, tag, attrs):
    """Handle the start tag of any element.

    tag   -- element name, possibly carrying a 'prefix:' part
    attrs -- sequence of (name, value) attribute pairs

    Updates the base-URI and language stacks, records namespace
    declarations, and then either copies the tag into the inline XHTML
    content being collected or dispatches to the '_start_<prefix>_<name>'
    handler, falling back to push() when no such handler exists.
    """
    if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
    # normalize attrs
    attrs = [(k.lower(), v) for k, v in attrs]
    # only the values of 'rel' and 'type' are lower-cased
    attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

    # track xml:base and xml:lang
    attrsD = dict(attrs)
    baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
    self.baseuri = _urljoin(self.baseuri, baseuri)
    lang = attrsD.get('xml:lang', attrsD.get('lang'))
    if lang == '':
        # xml:lang could be explicitly set to '', we need to capture that
        lang = None
    elif lang is None:
        # if no xml:lang is specified, use parent lang
        lang = self.lang
    if lang:
        if tag in ('feed', 'rss', 'rdf:RDF'):
            self.feeddata['language'] = lang
    self.lang = lang
    # pushed for every start tag; unknown_endtag pops them again
    self.basestack.append(self.baseuri)
    self.langstack.append(lang)

    # track namespaces
    for prefix, uri in attrs:
        if prefix.startswith('xmlns:'):
            self.trackNamespace(prefix[6:], uri)
        elif prefix == 'xmlns':
            self.trackNamespace(None, uri)

    # track inline content
    if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
        # element declared itself as escaped markup, but it isn't really
        self.contentparams['type'] = 'application/xhtml+xml'
    if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
        # Note: probably shouldn't simply recreate localname here, but
        # our namespace handling isn't actually 100% correct in cases where
        # the feed redefines the default namespace (which is actually
        # the usual case for inline content, thanks Sam), so here we
        # cheat and just reconstruct the element based on localname
        # because that compensates for the bugs in our namespace handling.
        # This will horribly munge inline content with non-empty qnames,
        # but nobody actually does that, so I'm not fixing it.
        tag = tag.split(':')[-1]
        # NOTE(review): attribute values are written back without escaping
        return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

    # match namespaces
    if tag.find(':') <> -1:
        prefix, suffix = tag.split(':', 1)
    else:
        prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix = prefix + '_'

    # special hack for better tracking of empty textinput/image elements in illformed feeds
    if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
        self.intextinput = 0
    if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
        self.inimage = 0

    # call special handler (if defined) or default handler
    methodname = '_start_' + prefix + suffix
    try:
        method = getattr(self, methodname)
        # an AttributeError raised inside the handler is caught below as well
        return method(attrsD)
    except AttributeError:
        return self.push(prefix + suffix, 1)
5.463 +
def unknown_endtag(self, tag):
    """Handle the end tag of any element.

    Dispatches to the '_end_<prefix>_<name>' handler (falling back to pop()),
    copies the closing tag into inline XHTML content when inside such
    content, and finally unwinds the base-URI and language stacks.
    """
    if _debug:
        sys.stderr.write('end %s\n' % tag)

    # match namespaces
    if ':' in tag:
        prefix, suffix = tag.split(':', 1)
    else:
        prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix = prefix + '_'
    element = prefix + suffix

    # call special handler (if defined) or default handler; an AttributeError
    # raised while the handler runs ends up in the fallback too
    try:
        getattr(self, '_end_' + element)()
    except AttributeError:
        self.pop(element)

    # track inline content
    if self.incontent and self.contentparams.has_key('type'):
        if not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
    if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
        localname = tag.split(':')[-1]
        self.handle_data('</%s>' % localname, escape=0)

    # track xml:base and xml:lang going out of scope
    if self.basestack:
        self.basestack.pop()
        if self.basestack:
            parent_base = self.basestack[-1]
            if parent_base:
                self.baseuri = parent_base
    if self.langstack:
        self.langstack.pop()
        if self.langstack:  # and (self.langstack[-1] is not None):
            self.lang = self.langstack[-1]
5.500 +
def handle_charref(self, ref):
    """Append the text for a numeric character reference to the open element.

    Called for each character reference, e.g. for '&#160;', ref will be '160'.
    References to the five markup-significant characters (", &, ', <, >) are
    kept as references; everything else is converted to its UTF-8 encoded
    character.  A reference whose number is not a code point unichr() can
    produce is kept as a literal reference instead of aborting the parse.
    Nothing happens when no element is open.
    """
    if not self.elementstack: return
    ref = ref.lower()
    if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
        text = '&#%s;' % ref
    else:
        if ref[0] == 'x':
            c = int(ref[1:], 16)
        else:
            c = int(ref)
        try:
            text = unichr(c).encode('utf-8')
        except (ValueError, OverflowError):
            # number outside the range unichr() accepts (feeds are untrusted
            # input); preserve the reference text rather than crash
            text = '&#%s;' % ref
    self.elementstack[-1][2].append(text)
5.514 +
def handle_entityref(self, ref):
    """Append the text for a named entity reference to the open element.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    The five XML entities and any unknown names are kept as references;
    known HTML entities become their UTF-8 encoded character.  Nothing
    happens when no element is open.
    """
    if not self.elementstack:
        return
    if _debug:
        sys.stderr.write('entering handle_entityref with %s\n' % ref)

    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        text = '&%s;' % ref
    else:
        # entity resolution graciously donated by Aaron Swartz
        def codepoint_for(name):
            import htmlentitydefs
            if hasattr(htmlentitydefs, 'name2codepoint'):  # requires Python 2.3
                return htmlentitydefs.name2codepoint[name]
            definition = htmlentitydefs.entitydefs[name]
            if definition.startswith('&#') and definition.endswith(';'):
                return int(definition[2:-1])  # not in latin-1
            return ord(definition)

        try:
            codepoint = codepoint_for(ref)
        except KeyError:
            text = '&%s;' % ref
        else:
            text = unichr(codepoint).encode('utf-8')
    self.elementstack[-1][2].append(text)
5.535 +
def handle_data(self, text, escape=1):
    """Append a block of plain text to the innermost open element.

    Called for each block of plain text, i.e. outside of any tag and not
    containing any character or entity references.  With ``escape`` set,
    the text is XML-escaped first when the current content type is
    'application/xhtml+xml'.  Nothing happens when no element is open.
    """
    if not self.elementstack:
        return
    inline_xhtml = self.contentparams.get('type') == 'application/xhtml+xml'
    if escape and inline_xhtml:
        text = _xmlescape(text)
    pieces = self.elementstack[-1][2]
    pieces.append(text)
5.543 +
def handle_comment(self, text):
    """Ignore a comment, e.g. <!-- insert message here -->."""
    return None
5.547 +
def handle_pi(self, text):
    """Ignore a processing instruction, e.g. <?instruction>."""
    return None
5.551 +
def handle_decl(self, text):
    """Ignore a declaration."""
    return None
5.554 +
def parse_declaration(self, i):
    """Consume the '<!...' construct that starts at offset ``i`` of rawdata.

    Overrides the internal declaration handler to handle CDATA blocks: the
    body of a '<![CDATA[ ... ]]>' section is XML-escaped and passed to
    handle_data() (an unterminated section runs to the end of the data).
    Any other declaration is skipped up to its closing '>'.

    Returns the offset just past the construct, or -1 when a non-CDATA
    declaration has no closing '>' yet.
    NOTE(review): -1 is assumed to be the base parser's "incomplete, wait
    for more data" signal -- confirm against the parser class in use.
    """
    if _debug: sys.stderr.write('entering parse_declaration\n')
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1: k = len(self.rawdata)
        self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
        return k+3
    else:
        k = self.rawdata.find('>', i)
        if k == -1:
            # previously this returned find()+1 == 0, which sent the caller
            # back to the very start of the buffer
            return -1
        return k+1
5.566 +
def mapContentType(self, contentType):
    """Return the lower-cased content type, expanding the short forms.

    'text', 'html' and 'xhtml' become 'text/plain', 'text/html' and
    'application/xhtml+xml'; any other value is returned lower-cased.
    """
    shorthand = {
        'text': 'text/plain',
        'html': 'text/html',
        'xhtml': 'application/xhtml+xml',
    }
    lowered = contentType.lower()
    return shorthand.get(lowered, lowered)
5.576 +
def trackNamespace(self, prefix, uri):
    """Record a namespace declaration made by the feed.

    prefix -- declared prefix, or None for the default namespace
    uri    -- namespace URI

    May set ``self.version`` (only while it is still empty) for the
    RSS 0.90, RSS 1.0 and Atom 1.0 namespaces, and updates
    ``namespacemap`` / ``namespacesInUse``.
    """
    loweruri = uri.lower()
    if not self.version:
        if prefix is None and loweruri == 'http://my.netscape.com/rdf/simple/0.9/':
            self.version = 'rss090'
        elif loweruri == 'http://purl.org/rss/1.0/':
            self.version = 'rss10'
        elif loweruri == 'http://www.w3.org/2005/atom':
            self.version = 'atom10'

    if 'backend.userland.com/rss' in loweruri:
        # match any backend.userland.com namespace
        uri = 'http://backend.userland.com/rss'
        loweruri = uri

    try:
        standard_prefix = self._matchnamespaces[loweruri]
    except KeyError:
        # unknown namespace: keep it under the prefix the feed chose
        self.namespacesInUse[prefix or ''] = uri
    else:
        self.namespacemap[prefix] = standard_prefix
        self.namespacesInUse[standard_prefix] = uri
5.594 +
def resolveURI(self, uri):
    """Join ``uri`` onto the current base URI ('' when no base is set)."""
    base = self.baseuri or ''
    return _urljoin(base, uri)
5.597 +
def decodeEntities(self, element, data):
    """Return ``data`` unchanged; ``element`` is not used here."""
    return data
5.600 +
def push(self, element, expectingText):
    """Open ``element`` on the element stack with an empty list of text pieces.

    Each stack entry is the list [element, expectingText, pieces].
    """
    entry = [element, expectingText, []]
    self.elementstack.append(entry)
5.603 +
def pop(self, element, stripWhitespace=1):
    """Close ``element``, post-process its collected text and store it.

    element         -- name the element was pushed under
    stripWhitespace -- when true, strip the joined text

    Returns None when the stack is empty or its top entry is not
    ``element``.  For elements pushed with a false expectingText the joined
    text is returned as is.  Otherwise the text is run through base64
    decoding, URI resolution, entity decoding, relative-URI resolution and
    sanitizing of embedded HTML, and conversion to unicode (each step
    applies only under the conditions checked below), stored in the current
    entry or feed/source context, and returned.
    """
    if not self.elementstack: return
    if self.elementstack[-1][0] != element: return

    element, expectingText, pieces = self.elementstack.pop()
    output = ''.join(pieces)
    if stripWhitespace:
        output = output.strip()
    if not expectingText: return output

    # decode base64 content
    # (the module name itself is tested first, so it may be a false value
    # when base64 support is unavailable)
    if base64 and self.contentparams.get('base64', 0):
        try:
            output = base64.decodestring(output)
        except binascii.Error:
            pass
        except binascii.Incomplete:
            pass

    # resolve relative URIs
    if (element in self.can_be_relative_uri) and output:
        output = self.resolveURI(output)

    # decode entities within embedded markup
    if not self.contentparams.get('base64', 0):
        output = self.decodeEntities(element, output)

    # remove temporary cruft from contentparams
    try:
        del self.contentparams['mode']
    except KeyError:
        pass
    try:
        del self.contentparams['base64']
    except KeyError:
        pass

    # resolve relative URIs within embedded markup
    # (a missing content type counts as 'text/html' here)
    if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
        if element in self.can_contain_relative_uris:
            output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

    # sanitize embedded markup
    if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
        if element in self.can_contain_dangerous_markup:
            output = _sanitizeHTML(output, self.encoding)

    # convert byte strings to unicode; on any failure the value is kept as is
    if self.encoding and type(output) != type(u''):
        try:
            output = unicode(output, self.encoding)
        except:
            pass

    # categories/tags/keywords/whatever are handled in _end_category
    if element == 'category':
        return output

    # store output in appropriate place(s)
    if self.inentry and not self.insource:
        if element == 'content':
            # every content element is appended to the entry's 'content' list
            self.entries[-1].setdefault(element, [])
            contentparams = copy.deepcopy(self.contentparams)
            contentparams['value'] = output
            self.entries[-1][element].append(contentparams)
        elif element == 'link':
            self.entries[-1][element] = output
            if output:
                # also written into the most recently added 'links' item
                self.entries[-1]['links'][-1]['href'] = output
        else:
            # inside an entry, 'description' is stored as 'summary'
            if element == 'description':
                element = 'summary'
            self.entries[-1][element] = output
            if self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element + '_detail'] = contentparams
    elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
        context = self._getContext()
        # at feed/source level, 'description' is stored as 'subtitle'
        if element == 'description':
            element = 'subtitle'
        context[element] = output
        if element == 'link':
            context['links'][-1]['href'] = output
        elif self.incontent:
            contentparams = copy.deepcopy(self.contentparams)
            contentparams['value'] = output
            context[element + '_detail'] = contentparams
    return output
5.692 +
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
    """Open a content-bearing element and set up ``self.contentparams``.

    The content type comes from the 'type' attribute (or
    ``defaultContentType``) after mapContentType(); language and base are
    taken from the current parser state, and 'base64' from _isBase64().
    """
    self.incontent += 1
    content_type = self.mapContentType(attrsD.get('type', defaultContentType))
    params = FeedParserDict({
        'type': content_type,
        'language': self.lang,
        'base': self.baseuri})
    self.contentparams = params
    params['base64'] = self._isBase64(attrsD, params)
    self.push(tag, expectingText)
5.701 +
def popContent(self, tag):
    """Close a content-bearing element and return its value.

    Decrements the content nesting counter and empties ``contentparams``
    after the element has been popped.
    """
    result = self.pop(tag)
    self.incontent -= 1
    self.contentparams.clear()
    return result
5.707 +
5.708 + def _mapToStandardPrefix(self, name):
5.709 + colonpos = name.find(':')
5.710 + if colonpos <> -1:
5.711 + prefix = name[:colonpos]
5.712 + suffix = name[colonpos+1:]
5.713 + prefix = self.namespacemap.get(prefix, prefix)
5.714 + name = prefix + ':' + suffix
5.715 + return name
5.716 +
5.717 + def _getAttribute(self, attrsD, name):
5.718 + return attrsD.get(self._mapToStandardPrefix(name))
5.719 +
5.720 + def _isBase64(self, attrsD, contentparams):
5.721 + if attrsD.get('mode', '') == 'base64':
5.722 + return 1
5.723 + if self.contentparams['type'].startswith('text/'):
5.724 + return 0
5.725 + if self.contentparams['type'].endswith('+xml'):
5.726 + return 0
5.727 + if self.contentparams['type'].endswith('/xml'):
5.728 + return 0
5.729 + return 1
5.730 +
5.731 + def _itsAnHrefDamnIt(self, attrsD):
5.732 + href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
5.733 + if href:
5.734 + try:
5.735 + del attrsD['url']
5.736 + except KeyError:
5.737 + pass
5.738 + try:
5.739 + del attrsD['uri']
5.740 + except KeyError:
5.741 + pass
5.742 + attrsD['href'] = href
5.743 + return attrsD
5.744 +
5.745 + def _save(self, key, value):
5.746 + context = self._getContext()
5.747 + context.setdefault(key, value)
5.748 +
5.749 + def _start_rss(self, attrsD):
5.750 + versionmap = {'0.91': 'rss091u',
5.751 + '0.92': 'rss092',
5.752 + '0.93': 'rss093',
5.753 + '0.94': 'rss094'}
5.754 + if not self.version:
5.755 + attr_version = attrsD.get('version', '')
5.756 + version = versionmap.get(attr_version)
5.757 + if version:
5.758 + self.version = version
5.759 + elif attr_version.startswith('2.'):
5.760 + self.version = 'rss20'
5.761 + else:
5.762 + self.version = 'rss'
5.763 +
5.764 + def _start_dlhottitles(self, attrsD):
5.765 + self.version = 'hotrss'
5.766 +
5.767 + def _start_channel(self, attrsD):
5.768 + self.infeed = 1
5.769 + self._cdf_common(attrsD)
5.770 + _start_feedinfo = _start_channel
5.771 +
5.772 + def _cdf_common(self, attrsD):
5.773 + if attrsD.has_key('lastmod'):
5.774 + self._start_modified({})
5.775 + self.elementstack[-1][-1] = attrsD['lastmod']
5.776 + self._end_modified()
5.777 + if attrsD.has_key('href'):
5.778 + self._start_link({})
5.779 + self.elementstack[-1][-1] = attrsD['href']
5.780 + self._end_link()
5.781 +
5.782 + def _start_feed(self, attrsD):
5.783 + self.infeed = 1
5.784 + versionmap = {'0.1': 'atom01',
5.785 + '0.2': 'atom02',
5.786 + '0.3': 'atom03'}
5.787 + if not self.version:
5.788 + attr_version = attrsD.get('version')
5.789 + version = versionmap.get(attr_version)
5.790 + if version:
5.791 + self.version = version
5.792 + else:
5.793 + self.version = 'atom'
5.794 +
5.795 + def _end_channel(self):
5.796 + self.infeed = 0
5.797 + _end_feed = _end_channel
5.798 +
def _start_image(self, attrsD):
    """Enter an <image> element and make sure the context has an 'image' dict."""
    self.inimage = 1
    self.push('image', 0)
    self._getContext().setdefault('image', FeedParserDict())
5.804 +
5.805 + def _end_image(self):
5.806 + self.pop('image')
5.807 + self.inimage = 0
5.808 +
def _start_textinput(self, attrsD):
    """Enter a text-input element and make sure the context has a 'textinput' dict."""
    self.intextinput = 1
    self.push('textinput', 0)
    self._getContext().setdefault('textinput', FeedParserDict())
_start_textInput = _start_textinput
5.815 +
5.816 + def _end_textinput(self):
5.817 + self.pop('textinput')
5.818 + self.intextinput = 0
5.819 + _end_textInput = _end_textinput
5.820 +
5.821 + def _start_author(self, attrsD):
5.822 + self.inauthor = 1
5.823 + self.push('author', 1)
5.824 + _start_managingeditor = _start_author
5.825 + _start_dc_author = _start_author
5.826 + _start_dc_creator = _start_author
5.827 + _start_itunes_author = _start_author
5.828 +
5.829 + def _end_author(self):
5.830 + self.pop('author')
5.831 + self.inauthor = 0
5.832 + self._sync_author_detail()
5.833 + _end_managingeditor = _end_author
5.834 + _end_dc_author = _end_author
5.835 + _end_dc_creator = _end_author
5.836 + _end_itunes_author = _end_author
5.837 +
5.838 + def _start_itunes_owner(self, attrsD):
5.839 + self.inpublisher = 1
5.840 + self.push('publisher', 0)
5.841 +
5.842 + def _end_itunes_owner(self):
5.843 + self.pop('publisher')
5.844 + self.inpublisher = 0
5.845 + self._sync_author_detail('publisher')
5.846 +
def _start_contributor(self, attrsD):
    """Enter a contributor element and add an empty record for it."""
    self.incontributor = 1
    contributors = self._getContext().setdefault('contributors', [])
    contributors.append(FeedParserDict())
    self.push('contributor', 0)
5.853 +
5.854 + def _end_contributor(self):
5.855 + self.pop('contributor')
5.856 + self.incontributor = 0
5.857 +
def _start_dc_contributor(self, attrsD):
    """Enter <dc:contributor>; its text is collected as the contributor's name."""
    self.incontributor = 1
    contributors = self._getContext().setdefault('contributors', [])
    contributors.append(FeedParserDict())
    # pushed as 'name' so that _end_name() picks the text up
    self.push('name', 0)
5.864 +
5.865 + def _end_dc_contributor(self):
5.866 + self._end_name()
5.867 + self.incontributor = 0
5.868 +
5.869 + def _start_name(self, attrsD):
5.870 + self.push('name', 0)
5.871 + _start_itunes_name = _start_name
5.872 +
5.873 + def _end_name(self):
5.874 + value = self.pop('name')
5.875 + if self.inpublisher:
5.876 + self._save_author('name', value, 'publisher')
5.877 + elif self.inauthor:
5.878 + self._save_author('name', value)
5.879 + elif self.incontributor:
5.880 + self._save_contributor('name', value)
5.881 + elif self.intextinput:
5.882 + context = self._getContext()
5.883 + context['textinput']['name'] = value
5.884 + _end_itunes_name = _end_name
5.885 +
5.886 + def _start_width(self, attrsD):
5.887 + self.push('width', 0)
5.888 +
5.889 + def _end_width(self):
5.890 + value = self.pop('width')
5.891 + try:
5.892 + value = int(value)
5.893 + except:
5.894 + value = 0
5.895 + if self.inimage:
5.896 + context = self._getContext()
5.897 + context['image']['width'] = value
5.898 +
5.899 + def _start_height(self, attrsD):
5.900 + self.push('height', 0)
5.901 +
5.902 + def _end_height(self):
5.903 + value = self.pop('height')
5.904 + try:
5.905 + value = int(value)
5.906 + except:
5.907 + value = 0
5.908 + if self.inimage:
5.909 + context = self._getContext()
5.910 + context['image']['height'] = value
5.911 +
5.912 + def _start_url(self, attrsD):
5.913 + self.push('href', 1)
5.914 + _start_homepage = _start_url
5.915 + _start_uri = _start_url
5.916 +
5.917 + def _end_url(self):
5.918 + value = self.pop('href')
5.919 + if self.inauthor:
5.920 + self._save_author('href', value)
5.921 + elif self.incontributor:
5.922 + self._save_contributor('href', value)
5.923 + elif self.inimage:
5.924 + context = self._getContext()
5.925 + context['image']['href'] = value
5.926 + elif self.intextinput:
5.927 + context = self._getContext()
5.928 + context['textinput']['link'] = value
5.929 + _end_homepage = _end_url
5.930 + _end_uri = _end_url
5.931 +
5.932 + def _start_email(self, attrsD):
5.933 + self.push('email', 0)
5.934 + _start_itunes_email = _start_email
5.935 +
5.936 + def _end_email(self):
5.937 + value = self.pop('email')
5.938 + if self.inpublisher:
5.939 + self._save_author('email', value, 'publisher')
5.940 + elif self.inauthor:
5.941 + self._save_author('email', value)
5.942 + elif self.incontributor:
5.943 + self._save_contributor('email', value)
5.944 + _end_itunes_email = _end_email
5.945 +
5.946 + def _getContext(self):
5.947 + if self.insource:
5.948 + context = self.sourcedata
5.949 + elif self.inentry:
5.950 + context = self.entries[-1]
5.951 + else:
5.952 + context = self.feeddata
5.953 + return context
5.954 +
def _save_author(self, key, value, prefix='author'):
    """Store one field of a person record and refresh its summary string.

    key    -- field name inside the detail dict, e.g. 'name' or 'email'
    value  -- field value
    prefix -- which person the field belongs to; the detail dict lives
              under '<prefix>_detail' in the current context

    The summary for ``prefix`` is re-synchronised afterwards.  Previously
    the 'author' summary was refreshed regardless of ``prefix``, so a
    publisher field did not update the publisher summary at this point.
    """
    context = self._getContext()
    context.setdefault(prefix + '_detail', FeedParserDict())
    context[prefix + '_detail'][key] = value
    self._sync_author_detail(prefix)
5.960 +
def _save_contributor(self, key, value):
    """Store one field on the most recent contributor record.

    A 'contributors' list holding one empty record is created first when
    the current context has none.
    """
    contributors = self._getContext().setdefault('contributors', [FeedParserDict()])
    contributors[-1][key] = value
5.965 +
5.966 + def _sync_author_detail(self, key='author'):
5.967 + context = self._getContext()
5.968 + detail = context.get('%s_detail' % key)
5.969 + if detail:
5.970 + name = detail.get('name')
5.971 + email = detail.get('email')
5.972 + if name and email:
5.973 + context[key] = '%s (%s)' % (name, email)
5.974 + elif name:
5.975 + context[key] = name
5.976 + elif email:
5.977 + context[key] = email
5.978 + else:
5.979 + author = context.get(key)
5.980 + if not author: return
5.981 + emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
5.982 + if not emailmatch: return
5.983 + email = emailmatch.group(0)
5.984 + # probably a better way to do the following, but it passes all the tests
5.985 + author = author.replace(email, '')
5.986 + author = author.replace('()', '')
5.987 + author = author.strip()
5.988 + if author and (author[0] == '('):
5.989 + author = author[1:]
5.990 + if author and (author[-1] == ')'):
5.991 + author = author[:-1]
5.992 + author = author.strip()
5.993 + context.setdefault('%s_detail' % key, FeedParserDict())
5.994 + context['%s_detail' % key]['name'] = author
5.995 + context['%s_detail' % key]['email'] = email
5.996 +
5.997 + def _start_subtitle(self, attrsD):
5.998 + self.pushContent('subtitle', attrsD, 'text/plain', 1)
5.999 + _start_tagline = _start_subtitle
5.1000 + _start_itunes_subtitle = _start_subtitle
5.1001 +
5.1002 + def _end_subtitle(self):
5.1003 + self.popContent('subtitle')
5.1004 + _end_tagline = _end_subtitle
5.1005 + _end_itunes_subtitle = _end_subtitle
5.1006 +
5.1007 + def _start_rights(self, attrsD):
5.1008 + self.pushContent('rights', attrsD, 'text/plain', 1)
5.1009 + _start_dc_rights = _start_rights
5.1010 + _start_copyright = _start_rights
5.1011 +
5.1012 + def _end_rights(self):
5.1013 + self.popContent('rights')
5.1014 + _end_dc_rights = _end_rights
5.1015 + _end_copyright = _end_rights
5.1016 +
def _start_item(self, attrsD):
    """Begin a new entry.

    Appends a fresh entry record, enters entry state, and uses an
    'rdf:about' attribute (when present) as the entry's id before handing
    the attributes to _cdf_common().
    """
    self.entries.append(FeedParserDict())
    self.push('item', 0)
    self.inentry = 1
    self.guidislink = 0
    about = self._getAttribute(attrsD, 'rdf:about')
    if about:
        self._getContext()['id'] = about
    self._cdf_common(attrsD)
_start_entry = _start_item
_start_product = _start_item
5.1029 +
5.1030 + def _end_item(self):
5.1031 + self.pop('item')
5.1032 + self.inentry = 0
5.1033 + _end_entry = _end_item
5.1034 +
5.1035 + def _start_dc_language(self, attrsD):
5.1036 + self.push('language', 1)
5.1037 + _start_language = _start_dc_language
5.1038 +
5.1039 + def _end_dc_language(self):
5.1040 + self.lang = self.pop('language')
5.1041 + _end_language = _end_dc_language
5.1042 +
5.1043 + def _start_dc_publisher(self, attrsD):
5.1044 + self.push('publisher', 1)
5.1045 + _start_webmaster = _start_dc_publisher
5.1046 +
5.1047 + def _end_dc_publisher(self):
5.1048 + self.pop('publisher')
5.1049 + self._sync_author_detail('publisher')
5.1050 + _end_webmaster = _end_dc_publisher
5.1051 +
5.1052 + def _start_published(self, attrsD):
5.1053 + self.push('published', 1)
5.1054 + _start_dcterms_issued = _start_published
5.1055 + _start_issued = _start_published
5.1056 +
def _end_published(self):
    """Close the publication date and save its parsed form as 'published_parsed'."""
    raw = self.pop('published')
    parsed = _parse_date(raw)
    self._save('published_parsed', parsed)
_end_dcterms_issued = _end_published
_end_issued = _end_published
5.1062 +
5.1063 + def _start_updated(self, attrsD):
5.1064 + self.push('updated', 1)
5.1065 + _start_modified = _start_updated
5.1066 + _start_dcterms_modified = _start_updated
5.1067 + _start_pubdate = _start_updated
5.1068 + _start_dc_date = _start_updated
5.1069 +
def _end_updated(self):
    """Close the last-updated date and save its parsed form as 'updated_parsed'."""
    self._save('updated_parsed', _parse_date(self.pop('updated')))
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
_end_pubdate = _end_updated
_end_dc_date = _end_updated
5.1078 +
5.1079 + def _start_created(self, attrsD):
5.1080 + self.push('created', 1)
5.1081 + _start_dcterms_created = _start_created
5.1082 +
def _end_created(self):
    """Close the creation date and save its parsed form as 'created_parsed'."""
    raw = self.pop('created')
    parsed = _parse_date(raw)
    self._save('created_parsed', parsed)
_end_dcterms_created = _end_created
5.1087 +
5.1088 + def _start_expirationdate(self, attrsD):
5.1089 + self.push('expired', 1)
5.1090 +
def _end_expirationdate(self):
    """Close the expiration date and save its parsed form as 'expired_parsed'."""
    raw = self.pop('expired')
    parsed = _parse_date(raw)
    self._save('expired_parsed', parsed)
5.1093 +
5.1094 + def _start_cc_license(self, attrsD):
5.1095 + self.push('license', 1)
5.1096 + value = self._getAttribute(attrsD, 'rdf:resource')
5.1097 + if value:
5.1098 + self.elementstack[-1][2].append(value)
5.1099 + self.pop('license')
5.1100 +
5.1101 + def _start_creativecommons_license(self, attrsD):
5.1102 + self.push('license', 1)
5.1103 +
5.1104 + def _end_creativecommons_license(self):
5.1105 + self.pop('license')
5.1106 +
5.1107 + def _addTag(self, term, scheme, label):
5.1108 + context = self._getContext()
5.1109 + tags = context.setdefault('tags', [])
5.1110 + if (not term) and (not scheme) and (not label): return
5.1111 + value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
5.1112 + if value not in tags:
5.1113 + tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
5.1114 +
def _start_category(self, attrsD):
    """Begin a category element and register the tag its attributes describe.

    The scheme comes from the 'scheme' attribute, falling back to 'domain'.
    The element text is collected too; _end_category() deals with it.
    """
    if _debug:
        sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
    scheme = attrsD.get('scheme', attrsD.get('domain'))
    self._addTag(attrsD.get('term'), scheme, attrsD.get('label'))
    self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category
5.1124 +
5.1125 + def _end_itunes_keywords(self):
5.1126 + for term in self.pop('itunes_keywords').split():
5.1127 + self._addTag(term, 'http://www.itunes.com/', None)
5.1128 +
5.1129 + def _start_itunes_category(self, attrsD):
5.1130 + self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
5.1131 + self.push('category', 1)
5.1132 +
5.1133 + def _end_category(self):
5.1134 + value = self.pop('category')
5.1135 + if not value: return
5.1136 + context = self._getContext()
5.1137 + tags = context['tags']
5.1138 + if value and len(tags) and not tags[-1]['term']:
5.1139 + tags[-1]['term'] = value
5.1140 + else:
5.1141 + self._addTag(value, None, None)
5.1142 + _end_dc_subject = _end_category
5.1143 + _end_keywords = _end_category
5.1144 + _end_itunes_category = _end_category
5.1145 +
def _start_cloud(self, attrsD):
    """Store a copy of the <cloud> attributes under 'cloud' in the current context."""
    context = self._getContext()
    context['cloud'] = FeedParserDict(attrsD)
5.1148 +
def _start_link(self, attrsD):
    """Begin a link element and record it in the context's 'links' list.

    'rel' defaults to 'alternate' and 'type' to 'text/html'; the target is
    normalised to a resolved 'href'.  Links with rel="enclosure" are also
    passed to _start_enclosure().  A link that carries an href is complete
    (and becomes context['link'] when it is an HTML alternate); a link
    without one is pushed so that its text can supply the target.
    """
    attrsD.setdefault('rel', 'alternate')
    attrsD.setdefault('type', 'text/html')
    attrsD = self._itsAnHrefDamnIt(attrsD)
    has_href = 'href' in attrsD
    if has_href:
        attrsD['href'] = self.resolveURI(attrsD['href'])
    expectingText = self.infeed or self.inentry or self.insource

    context = self._getContext()
    context.setdefault('links', [])
    context['links'].append(FeedParserDict(attrsD))
    if attrsD['rel'] == 'enclosure':
        self._start_enclosure(attrsD)

    if not has_href:
        self.push('link', expectingText)
        return
    is_alternate = attrsD.get('rel') == 'alternate'
    if is_alternate and (self.mapContentType(attrsD.get('type')) in self.html_types):
        context['link'] = attrsD['href']
_start_producturl = _start_link
5.1168 +
5.1169 + def _end_link(self):
5.1170 + value = self.pop('link')
5.1171 + context = self._getContext()
5.1172 + if self.intextinput:
5.1173 + context['textinput']['link'] = value
5.1174 + if self.inimage:
5.1175 + context['image']['link'] = value
5.1176 + _end_producturl = _end_link
5.1177 +
5.1178 + def _start_guid(self, attrsD):
5.1179 + self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
5.1180 + self.push('id', 1)
5.1181 +
5.1182 + def _end_guid(self):
5.1183 + value = self.pop('id')
5.1184 + self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
5.1185 + if self.guidislink:
5.1186 + # guid acts as link, but only if 'ispermalink' is not present or is 'true',
5.1187 + # and only if the item doesn't already have a link element
5.1188 + self._save('link', value)
5.1189 +
5.1190 + def _start_title(self, attrsD):
5.1191 + self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
5.1192 + _start_dc_title = _start_title
5.1193 + _start_media_title = _start_title
5.1194 +
5.1195 + def _end_title(self):
5.1196 + value = self.popContent('title')
5.1197 + context = self._getContext()
5.1198 + if self.intextinput:
5.1199 + context['textinput']['title'] = value
5.1200 + elif self.inimage:
5.1201 + context['image']['title'] = value
5.1202 + _end_dc_title = _end_title
5.1203 + _end_media_title = _end_title
5.1204 +
5.1205 + def _start_description(self, attrsD):
5.1206 + context = self._getContext()
5.1207 + if context.has_key('summary'):
5.1208 + self._summaryKey = 'content'
5.1209 + self._start_content(attrsD)
5.1210 + else:
5.1211 + self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
5.1212 +
5.1213 + def _start_abstract(self, attrsD):
5.1214 + self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
5.1215 +
def _end_description(self):
    '''Close a description (or abstract) element.

    If the matching start handler redirected the element to content,
    finish it as content; otherwise pop the description and mirror it
    onto the textinput/image sub-dictionary when inside one of those.
    _summaryKey is always reset afterwards.
    '''
    if self._summaryKey != 'content':
        text = self.popContent('description')
        ctx = self._getContext()
        if self.intextinput:
            ctx['textinput']['description'] = text
        elif self.inimage:
            ctx['image']['description'] = text
    else:
        self._end_content()
    self._summaryKey = None
_end_abstract = _end_description
5.1228 +
def _start_info(self, attrsD):
    '''Open an info element as 'text/plain' content.'''
    default_type = 'text/plain'
    self.pushContent('info', attrsD, default_type, 1)
_start_feedburner_browserfriendly = _start_info
5.1232 +
def _end_info(self):
    '''Close an info element; the popped value is not used here.'''
    element = 'info'
    self.popContent(element)
_end_feedburner_browserfriendly = _end_info
5.1236 +
def _start_generator(self, attrsD):
    '''Open a generator element and record its attributes as
    'generator_detail' on the current context.

    Non-empty attributes are first passed through _itsAnHrefDamnIt and
    any resulting 'href' is resolved with resolveURI.
    '''
    if attrsD:
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            resolved = self.resolveURI(attrsD['href'])
            attrsD['href'] = resolved
    detail = FeedParserDict(attrsD)
    self._getContext()['generator_detail'] = detail
    self.push('generator', 1)
5.1244 +
def _end_generator(self):
    '''Close a generator element; its text becomes the 'name' of an
    existing 'generator_detail' entry, if the context has one.
    '''
    name = self.pop('generator')
    ctx = self._getContext()
    if not ctx.has_key('generator_detail'):
        return
    ctx['generator_detail']['name'] = name
5.1250 +
def _start_admin_generatoragent(self, attrsD):
    '''Handle admin:generatorAgent, whose value lives in the rdf:resource
    attribute rather than in element text.

    The attribute value is appended to the text pieces of the element
    just pushed, the element is popped immediately, and the value is
    also recorded as the href of 'generator_detail'.
    '''
    self.push('generator', 1)
    resource = self._getAttribute(attrsD, 'rdf:resource')
    if resource:
        pieces = self.elementstack[-1][2]
        pieces.append(resource)
    self.pop('generator')
    detail = FeedParserDict({'href': resource})
    self._getContext()['generator_detail'] = detail
5.1258 +
def _start_admin_errorreportsto(self, attrsD):
    '''Handle admin:errorReportsTo: use the rdf:resource attribute as the
    element's text, then pop the element straight away.
    '''
    self.push('errorreportsto', 1)
    resource = self._getAttribute(attrsD, 'rdf:resource')
    if resource:
        pieces = self.elementstack[-1][2]
        pieces.append(resource)
    self.pop('errorreportsto')
5.1265 +
def _start_summary(self, attrsD):
    '''Open a summary element.

    A second summary in the same context (one already has a 'summary'
    key) is treated as content; otherwise it is pushed as 'text/plain'
    summary content.  _summaryKey remembers which path was taken.
    '''
    if self._getContext().has_key('summary'):
        self._summaryKey = 'content'
        self._start_content(attrsD)
        return
    self._summaryKey = 'summary'
    self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
_start_itunes_summary = _start_summary
5.1275 +
def _end_summary(self):
    '''Close a summary element along the path chosen by the start handler,
    then reset _summaryKey.
    '''
    if self._summaryKey != 'content':
        # fall back to 'summary' when no start handler recorded a key
        self.popContent(self._summaryKey or 'summary')
    else:
        self._end_content()
    self._summaryKey = None
_end_itunes_summary = _end_summary
5.1283 +
def _start_enclosure(self, attrsD):
    '''Record an enclosure on the current context.

    The attributes (after _itsAnHrefDamnIt) are appended to the
    'enclosures' list, and the enclosure's href is used as the context
    'id' when the context has no truthy id yet.
    '''
    attrsD = self._itsAnHrefDamnIt(attrsD)
    enclosures = self._getContext().setdefault('enclosures', [])
    enclosures.append(FeedParserDict(attrsD))
    href = attrsD.get('href')
    if not href:
        return
    ctx = self._getContext()
    if not ctx.get('id'):
        ctx['id'] = href
5.1292 +
def _start_source(self, attrsD):
    '''Mark that parsing is now inside a source element (attrsD is unused).'''
    self.insource = 1
5.1295 +
def _end_source(self):
    '''Leave the source element: store a deep copy of the collected source
    data on the current context, then empty the collector for reuse.
    '''
    self.insource = 0
    snapshot = copy.deepcopy(self.sourcedata)
    self._getContext()['source'] = snapshot
    self.sourcedata.clear()
5.1300 +
def _start_content(self, attrsD):
    '''Open a content element as 'text/plain' content, remembering a 'src'
    attribute (if any) in contentparams, and push a 'content' element.
    '''
    self.pushContent('content', attrsD, 'text/plain', 1)
    source_url = attrsD.get('src')
    if source_url:
        self.contentparams['src'] = source_url
    self.push('content', 1)
5.1307 +
def _start_prodlink(self, attrsD):
    '''Open a prodlink element, treated as 'text/html' content.'''
    default_type = 'text/html'
    self.pushContent('content', attrsD, default_type, 1)
5.1310 +
def _start_body(self, attrsD):
    '''Open a body element, treated as 'application/xhtml+xml' content.'''
    default_type = 'application/xhtml+xml'
    self.pushContent('content', attrsD, default_type, 1)
_start_xhtml_body = _start_body
5.1314 +
def _start_content_encoded(self, attrsD):
    '''Open a content:encoded (or fullitem) element as 'text/html' content.'''
    default_type = 'text/html'
    self.pushContent('content', attrsD, default_type, 1)
_start_fullitem = _start_content_encoded
5.1318 +
def _end_content(self):
    '''Close a content element; plain-text and HTML content is also saved
    as the 'description'.

    The content type is inspected before popContent() is called.
    '''
    mapped_type = self.mapContentType(self.contentparams.get('type'))
    copyable_types = ['text/plain'] + self.html_types
    copy_to_description = mapped_type in copyable_types
    body = self.popContent('content')
    if copy_to_description:
        self._save('description', body)
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
_end_prodlink = _end_content
5.1329 +
def _start_itunes_image(self, attrsD):
    '''Handle itunes:image (and itunes:link): store the element's href
    attribute as the context's image.
    '''
    self.push('itunes_image', 0)
    image = FeedParserDict({'href': attrsD.get('href')})
    self._getContext()['image'] = image
_start_itunes_link = _start_itunes_image
5.1334 +
def _end_itunes_block(self):
    '''Close itunes:block, storing 1 when its text is exactly 'yes', else 0.'''
    text = self.pop('itunes_block', 0)
    flag = 0
    if text == 'yes':
        flag = 1
    self._getContext()['itunes_block'] = flag
5.1338 +
def _end_itunes_explicit(self):
    '''Close itunes:explicit, storing 1 when its text is exactly 'yes', else 0.'''
    text = self.pop('itunes_explicit', 0)
    flag = 0
    if text == 'yes':
        flag = 1
    self._getContext()['itunes_explicit'] = flag
5.1342 +
if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        '''Namespace-aware SAX content handler that translates SAX events
        into the unknown_starttag / unknown_endtag / handle_data calls of
        _FeedParserMixin.

        bozo becomes 1 and exc holds the exception once the SAX parser
        reports an error.
        '''

        def __init__(self, baseuri, baselang, encoding):
            if _debug:
                sys.stderr.write('trying StrictFeedParser\n')
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.exc = None
            self.bozo = 0

        def startPrefixMapping(self, prefix, uri):
            '''SAX callback: register a namespace declaration.'''
            self.trackNamespace(prefix, uri)

        def startElementNS(self, name, qname, attrs):
            '''SAX callback: normalize the element and attribute names to
            lowercase 'prefix:localname' form and dispatch the start tag.

            Raises UndeclaredNamespace when the element's own prefix is
            neither recognized nor declared.
            '''
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if lowernamespace.find('backend.userland.com/rss') != -1:
                # match any backend.userland.com namespace
                namespace = lowernamespace = 'http://backend.userland.com/rss'
            givenprefix = None
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix:
                unmatched = (prefix is None) or (prefix == '' and lowernamespace == '')
                if unmatched and not self.namespacesInUse.has_key(givenprefix):
                    raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            if _debug:
                sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all).  Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD = {}
            for (attrnamespace, attrlocalname), attrvalue in attrs._attrs.items():
                attrprefix = self._matchnamespaces.get((attrnamespace or '').lower(), '')
                if attrprefix:
                    attrlocalname = attrprefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for attrqname in attrs.getQNames():
                attrsD[str(attrqname).lower()] = attrs.getValueByQName(attrqname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            '''SAX callback: forward character data.'''
            self.handle_data(text)

        def endElementNS(self, name, qname):
            '''SAX callback: rebuild the lowercase 'prefix:localname' form
            and dispatch the end tag.
            '''
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            givenprefix = ''
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            self.unknown_endtag(str(localname).lower())

        def error(self, exc):
            '''SAX callback: remember a recoverable error.'''
            self.bozo = 1
            self.exc = exc

        def fatalError(self, exc):
            '''SAX callback: remember the error, then re-raise it.'''
            self.error(exc)
            raise exc
5.1415 +
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    '''Pass-through SGML parser.

    Every parse event is re-serialized and appended to self.pieces, so
    output() returns markup rebuilt from the input.  Subclasses override
    individual handlers to rewrite or drop parts of it.
    '''
    # elements written in self-closing form; no end tag is emitted for them
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        # encoding is used to encode unicode input and to decode attribute
        # names and values
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        '''Discard collected output and reset the underlying parser.'''
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        '''re.sub callback: expand '<tag/>' to '<tag />' for elements without
        an end tag, and to '<tag></tag>' for everything else.'''
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        '''Normalize data, then hand it to the SGML parser.'''
        # escape '<!' unless it starts a DOCTYPE, a comment or a marked
        # section, so that it is not parsed as a declaration
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        # turn numeric references for the two quote characters into literals
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        '''Return attrs with lowercased names, and lowercased values for the
        'rel' and 'type' attributes.'''
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        '''Scan a declaration name starting at offset i of the raw buffer.

        Returns (lowercased name, end offset), or (None, -1) at the end
        of the buffer.  When no name matches, the whole raw buffer is
        emitted as text and (None, -1) is returned.
        '''
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
5.1530 +
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    '''Feed parser built on the SGML-based _BaseHTMLProcessor instead of SAX.'''

    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        '''Normalize markup-significant character references in data.

        Decimal and hexadecimal references for < > & " ' are first
        rewritten as the named entities.  When a content type is known
        and it does not end in 'xml', those named entities are then
        decoded to the literal characters.  element is unused.
        '''
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
5.1554 +
class _RelativeURIResolver(_BaseHTMLProcessor):
    '''HTML pass-through processor that rewrites URI-valued attributes
    relative to a base URI.
    '''
    # (element, attribute) pairs whose values are resolved
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        '''Join uri onto the base URI.'''
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        '''Resolve the listed attributes of tag, then emit the start tag.'''
        resolved_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # keep the raw value when resolution yields an empty result
                value = self.resolveURI(value) or value
            resolved_attrs.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, resolved_attrs)
5.1593 +
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    '''Return htmlSource with its URI-valued attributes resolved against baseURI.'''
    if _debug:
        sys.stderr.write('entering _resolveRelativeURIs\n')
    resolver = _RelativeURIResolver(baseURI, encoding)
    resolver.feed(htmlSource)
    return resolver.output()
5.1599 +
class _HTMLSanitizer(_BaseHTMLProcessor):
    '''HTML pass-through processor that keeps only whitelisted elements and
    attributes, and drops the text inside script/applet elements.
    '''
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    # elements whose text content is dropped along with the tags
    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        # number of currently open script/applet elements
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        '''Emit the start tag only for whitelisted elements, keeping only
        whitelisted attributes.'''
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        '''Emit the end tag only for whitelisted elements.'''
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # A stray end tag with no matching start tag must not push
                # the counter below zero: a negative count is truthy and
                # would make handle_data() drop all following text.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # processing instructions are dropped
        pass

    def handle_decl(self, text):
        # declarations are dropped
        pass

    def handle_data(self, text):
        # text is kept only while outside script/applet elements
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
5.1652 +
def _sanitizeHTML(htmlSource, encoding):
    '''Return htmlSource run through _HTMLSanitizer and, when TIDY_MARKUP is
    set and a Tidy binding can be imported, through Tidy as well.

    After Tidy, only what lies between the <body ...> start tag and
    </body is kept.  The result is stripped and has CRLF converted to LF.
    '''
    sanitizer = _HTMLSanitizer(encoding)
    sanitizer.feed(htmlSource)
    data = sanitizer.output()
    if TIDY_MARKUP:
        def _find_tidy():
            # Walk the preferred Tidy interfaces and return a common
            # wrapper around the first one that imports; None otherwise.
            for tidy_interface in PREFERRED_TIDY_INTERFACES:
                try:
                    if tidy_interface == "uTidy":
                        from tidy import parseString as _utidy
                        def _wrapper(data, **kwargs):
                            return str(_utidy(data, **kwargs))
                        return _wrapper
                    elif tidy_interface == "mxTidy":
                        from mx.Tidy import Tidy as _mxtidy
                        def _wrapper(data, **kwargs):
                            nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                            return data
                        return _wrapper
                except:
                    # interface not usable; try the next one
                    pass
            return None

        _tidy = _find_tidy()
        if _tidy:
            was_unicode = type(data) == type(u'')
            if was_unicode:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if was_unicode:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    return data.strip().replace('\r\n', '\n')
5.1691 +
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    '''urllib2 handler that returns HTTP error responses instead of raising.

    The response object carries the HTTP status code in its .status
    attribute.  Redirects are followed when a Location header is present,
    and a 401 is retried once with digest authentication.
    '''

    def http_error_default(self, req, fp, code, msg, headers):
        # any 3xx other than 304 is handled like a 302
        if ((code // 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        # do not overwrite a status that is already set
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        # do not overwrite a status that is already set
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        # explicit checks instead of assert, which is stripped under -O
        if base64 is None or sys.version_info < (2, 3, 3):
            return self.http_error_default(req, fp, code, msg, headers)
        try:
            credentials = base64.decodestring(req.headers['Authorization'].split(' ')[1])
            # split on the first colon only: the password may itself
            # contain ':' characters
            user, passw = credentials.split(':', 1)
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            # deliberately best-effort: on any failure, return the 401
            # response as-is
            return self.http_error_default(req, fp, code, msg, headers)
5.1745 +
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    # encodestring() wraps its output with a newline after
                    # every 76 characters; remove all of them, not just the
                    # trailing one, so long credentials cannot break the
                    # Authorization header
                    auth = base64.encodestring(user_passwd).replace('\n', '')
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header('Referer', referrer)
        # advertise only the compression schemes whose modules were imported
        if gzip and zlib:
            request.add_header('Accept-encoding', 'gzip, deflate')
        elif gzip:
            request.add_header('Accept-encoding', 'gzip')
        elif zlib:
            request.add_header('Accept-encoding', 'deflate')
        else:
            request.add_header('Accept-encoding', '')
        if auth:
            request.add_header('Authorization', 'Basic %s' % auth)
        if ACCEPT_HEADER:
            request.add_header('Accept', ACCEPT_HEADER)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = urllib2.build_opener(*([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        # deliberately best-effort: anything that cannot be opened as a file
        # is treated as literal data below
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
5.1835 +
# Date handler functions; handlers registered later sit nearer the front.
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers[:0] = [func]
5.1840 +
5.1841 +# ISO-8601 date parsing routines written by Fazal Majid.
5.1842 +# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
5.1843 +# parser is beyond the scope of feedparser and would be a worthwhile addition
5.1844 +# to the Python library.
5.1845 +# A single regular expression cannot parse ISO 8601 date formats into groups
5.1846 +# as the standard is highly irregular (for instance is 030104 2003-01-04 or
5.1847 +# 0301-04-01), so we use templates instead.
5.1848 +# Please note the order in templates is significant because we need a
5.1849 +# greedy match.
# Date templates, most specific first.  Each placeholder (YYYY, YY, MM, DD,
# OOO, CC) is expanded below into a named regex group.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
# One regex source per template: the expanded date part followed by an
# optional time-of-day and time zone part.
_iso8601_re = []
for tmpl in _iso8601_tmpl:
    regex = tmpl
    # the order matters: YYYY must be expanded before YY
    for token, pattern in [('YYYY', r'(?P<year>\d{4})'),
                           ('YY', r'(?P<year>\d\d)'),
                           ('MM', r'(?P<month>[01]\d)'),
                           ('DD', r'(?P<day>[0123]\d)'),
                           ('OOO', r'(?P<ordinal>[0123]\d\d)'),
                           ('CC', r'(?P<century>\d\d$)')]:
        regex = regex.replace(token, pattern)
    regex = (regex
             + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
             + r'(:(?P<second>\d{2}))?'
             + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?')
    _iso8601_re.append(regex)
# Bound match methods of the compiled patterns, in template order.
_iso8601_matches = []
for regex in _iso8601_re:
    _iso8601_matches.append(re.compile(regex).match)
# keep the loop temporaries out of the module namespace
del tmpl, regex, token, pattern
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105

    Returns a 9-item time tuple in GMT, or None when no template matches
    (or only the empty string matches).  Fields missing from dateString
    are filled in from the current GMT date.
    '''
    # calendar is imported here so this block does not depend on the
    # file's import section
    import calendar
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    # the last, empty template matches anything with a zero-length span
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * (time.gmtime()[0] // 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is recomputed during normalization, we can ignore it
    weekday = 0
    # the fields below end up in GMT, which has no daylight savings
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # The fields are already GMT but may be out of range (day-of-year
    # stored as the day, hours/minutes shifted by the zone offset).
    # Normalize them in GMT: calendar.timegm() turns the fields into a
    # timestamp without consulting the local time zone.  Going through
    # time.mktime()/time.localtime() instead would treat the fields as local
    # standard time and shift dates that fall in local DST by an hour.
    return time.gmtime(calendar.timegm(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
5.1950 +
5.1951 +# 8-bit date handling routines written by ytrewq1.
# Unicode strings for the year/month/day suffixes and the AM/PM markers
# used by the Korean date formats below.
_korean_year = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day = u'\uc77c' # c0cf in euc-kr
_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

# OnBlog: 4-digit year, 2-digit month and day, each followed by its suffix,
# then HH:MM:SS.  Groups 1-6: year, month, day, hour, minute, second.
_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
# Nate: YYYY-MM-DD, an AM/PM marker, then H:M:S with up to two digits each.
# Groups 1-7: year, month, day, am/pm marker, hour, minute, second.
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format.

    The matched fields are re-assembled into a W3DTF string with a fixed
    +09:00 zone offset and handed to _parse_date_w3dtf, whose result is
    returned.  Returns None when dateString does not match.
    '''
    matched = _korean_onblog_date_re.match(dateString)
    if matched is None:
        return
    year, month, day, hour, minute, second = matched.groups()
    fields = {
        'year': year, 'month': month, 'day': day,
        'hour': hour, 'minute': minute, 'second': second,
        'zonediff': '+09:00',
    }
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % fields
    if _debug:
        sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
5.1975 +
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format.

    The format carries a Korean AM/PM marker followed by a 12-hour clock
    reading.  The fields are re-assembled into a W3DTF string with a fixed
    +09:00 zone offset and handed to _parse_date_w3dtf, whose result is
    returned.  Returns None when dateString does not match.

    Raises ValueError if the hour field matched as an empty string (the
    pattern allows zero digits).
    '''
    m = _korean_nate_date_re.match(dateString)
    if not m:
        return
    hour = int(m.group(5))
    # Convert the 12-hour reading to a 24-hour one.  12 PM is noon and must
    # stay 12 (blindly adding 12 yields 24, which rolls over into the next
    # day); 12 AM is midnight and becomes 0.
    if m.group(4) == _korean_pm:
        if hour < 12:
            hour += 12
    elif hour == 12:
        hour = 0
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
        {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),
         'hour': '%02d' % hour, 'minute': m.group(6), 'second': m.group(7),
         'zonediff': '+09:00'}
    if _debug:
        sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
5.1994 +
# yyyy-mm-dd hh:mm:ss with an optional fractional-seconds suffix
_mssql_date_re = re.compile(
    r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')

def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format.

    The fractional-seconds group is matched but not used.  The fields are
    re-assembled into a W3DTF string and handed to _parse_date_w3dtf, whose
    result is returned.  Returns None when dateString does not match.
    '''
    matched = _mssql_date_re.match(dateString)
    if matched is None:
        return
    year, month, day, hour, minute, second = matched.groups()[:6]
    # NOTE(review): the +09:00 offset is the same one the Korean handlers
    # use; confirm it is really intended for MS SQL dates.
    fields = {
        'year': year, 'month': month, 'day': day,
        'hour': hour, 'minute': minute, 'second': second,
        'zonediff': '+09:00',
    }
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % fields
    if _debug:
        sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
5.2008 +
5.2009 +# Unicode strings for Greek date strings
5.2010 +_greek_months = \
5.2011 + { \
5.2012 + u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
5.2013 + u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
5.2014 + u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
5.2015 + u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
5.2016 + u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
5.2017 + u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
5.2018 + u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
5.2019 + u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
5.2020 + u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
5.2021 + u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
5.2022 + u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
5.2023 + u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
5.2024 + u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
5.2025 + u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
5.2026 + u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
5.2027 + u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
5.2028 + u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
5.2029 + u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
5.2030 + u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
5.2031 + }
5.2032 +
5.2033 +_greek_wdays = \
5.2034 + { \
5.2035 + u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
5.2036 + u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
5.2037 + u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
5.2038 + u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
5.2039 + u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
5.2040 + u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
5.2041 + u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
5.2042 + }
5.2043 +
5.2044 +_greek_date_format_re = \
5.2045 + re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
5.2046 +
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.

    The Greek weekday and month abbreviations are translated to their
    English equivalents, the fields are re-assembled into an RFC 822 style
    string, and that string is handed to _parse_date_rfc822, whose result
    is returned.  Returns None when dateString does not match or uses a
    weekday/month abbreviation that is not in the lookup tables.
    '''
    m = _greek_date_format_re.match(dateString)
    if not m:
        return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except KeyError:
        # Unknown abbreviation: a table miss is the only failure possible
        # here, so nothing else is swallowed.
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
        {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),
         'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),
         'zonediff': m.group(8)}
    if _debug:
        sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
5.2063 +
5.2064 +# Unicode strings for Hungarian date strings
5.2065 +_hungarian_months = \
5.2066 + { \
5.2067 + u'janu\u00e1r': u'01', # e1 in iso-8859-2
5.2068 + u'febru\u00e1ri': u'02', # e1 in iso-8859-2
5.2069 + u'm\u00e1rcius': u'03', # e1 in iso-8859-2
5.2070 + u'\u00e1prilis': u'04', # e1 in iso-8859-2
5.2071 + u'm\u00e1ujus': u'05', # e1 in iso-8859-2
5.2072 + u'j\u00fanius': u'06', # fa in iso-8859-2
5.2073 + u'j\u00falius': u'07', # fa in iso-8859-2
5.2074 + u'augusztus': u'08',
5.2075 + u'szeptember': u'09',
5.2076 + u'okt\u00f3ber': u'10', # f3 in iso-8859-2
5.2077 + u'november': u'11',
5.2078 + u'december': u'12',
5.2079 + }
5.2080 +
5.2081 +_hungarian_date_format_re = \
5.2082 + re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
5.2083 +
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.

    The month name is translated to its number, single-digit day and hour
    fields are zero-padded, and the result is re-assembled into a W3DTF
    string that is handed to _parse_date_w3dtf, whose result is returned.
    Returns None when dateString does not match or names an unknown month.
    '''
    m = _hungarian_date_format_re.match(dateString)
    if not m:
        return
    try:
        month = _hungarian_months[m.group(2)]
    except KeyError:
        # Unknown month name; the lookup is the only step that can fail.
        return
    day = m.group(3)
    if len(day) == 1:
        day = '0' + day
    hour = m.group(4)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
        {'year': m.group(1), 'month': month, 'day': day,
         'hour': hour, 'minute': m.group(5),
         'zonediff': m.group(6)}
    if _debug:
        sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
5.2105 +
5.2106 +# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
5.2107 +# Drake and licensed under the Python license. Removed all range checking
5.2108 +# for month, day, hour, minute, and second, since mktime will normalize
5.2109 +# these later
5.2110 +def _parse_date_w3dtf(dateString):
5.2111 + def __extract_date(m):
5.2112 + year = int(m.group('year'))
5.2113 + if year < 100:
5.2114 + year = 100 * int(time.gmtime()[0] / 100) + int(year)
5.2115 + if year < 1000:
5.2116 + return 0, 0, 0
5.2117 + julian = m.group('julian')
5.2118 + if julian:
5.2119 + julian = int(julian)
5.2120 + month = julian / 30 + 1
5.2121 + day = julian % 30 + 1
5.2122 + jday = None
5.2123 + while jday != julian:
5.2124 + t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
5.2125 + jday = time.gmtime(t)[-2]
5.2126 + diff = abs(jday - julian)
5.2127 + if jday > julian:
5.2128 + if diff < day:
5.2129 + day = day - diff
5.2130 + else:
5.2131 + month = month - 1
5.2132 + day = 31
5.2133 + elif jday < julian:
5.2134 + if day + diff < 28:
5.2135 + day = day + diff
5.2136 + else:
5.2137 + month = month + 1
5.2138 + return year, month, day
5.2139 + month = m.group('month')
5.2140 + day = 1
5.2141 + if month is None:
5.2142 + month = 1
5.2143 + else:
5.2144 + month = int(month)
5.2145 + day = m.group('day')
5.2146 + if day:
5.2147 + day = int(day)
5.2148 + else:
5.2149 + day = 1
5.2150 + return year, month, day
5.2151 +
5.2152 + def __extract_time(m):
5.2153 + if not m:
5.2154 + return 0, 0, 0
5.2155 + hours = m.group('hours')
5.2156 + if not hours:
5.2157 + return 0, 0, 0
5.2158 + hours = int(hours)
5.2159 + minutes = int(m.group('minutes'))
5.2160 + seconds = m.group('seconds')
5.2161 + if seconds:
5.2162 + seconds = int(seconds)
5.2163 + else:
5.2164 + seconds = 0
5.2165 + return hours, minutes, seconds
5.2166 +
5.2167 + def __extract_tzd(m):
5.2168 + '''Return the Time Zone Designator as an offset in seconds from UTC.'''
5.2169 + if not m:
5.2170 + return 0
5.2171 + tzd = m.group('tzd')
5.2172 + if not tzd:
5.2173 + return 0
5.2174 + if tzd == 'Z':
5.2175 + return 0
5.2176 + hours = int(m.group('tzdhours'))
5.2177 + minutes = m.group('tzdminutes')
5.2178 + if minutes:
5.2179 + minutes = int(minutes)
5.2180 + else:
5.2181 + minutes = 0
5.2182 + offset = (hours*60 + minutes) * 60
5.2183 + if tzd[0] == '+':
5.2184 + return -offset
5.2185 + return offset
5.2186 +
5.2187 + __date_re = ('(?P<year>\d\d\d\d)'
5.2188 + '(?:(?P<dsep>-|)'
5.2189 + '(?:(?P<julian>\d\d\d)'
5.2190 + '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
5.2191 + __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
5.2192 + __tzd_rx = re.compile(__tzd_re)
5.2193 + __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
5.2194 + '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
5.2195 + + __tzd_re)
5.2196 + __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
5.2197 + __datetime_rx = re.compile(__datetime_re)
5.2198 + m = __datetime_rx.match(dateString)
5.2199 + if (m is None) or (m.group() != dateString): return
5.2200 + gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
5.2201 + if gmt[0] == 0: return
5.2202 + return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
5.2203 +registerDateHandler(_parse_date_w3dtf)
5.2204 +
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date.

    Returns a time tuple (as produced by time.gmtime) or None when the
    string is empty or rfc822.parsedate_tz cannot parse it.
    '''
    data = dateString.split()
    if not data:
        # Empty or whitespace-only input: nothing to parse, and indexing
        # data[0] below would raise IndexError.
        return None
    # Drop a leading weekday token ('Mon,' / 'Mon.' / 'mon').
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        # Four tokens left: split a '+zone' suffix off the last token if
        # there is one, otherwise add an empty zone token.
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        # No time/zone present; default to midnight GMT.
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
5.2228 +
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT

    Each handler in _date_handlers is tried in order.  The first truthy
    result that has nine int-convertible items is returned; a handler that
    raises, returns a false value, or returns a malformed tuple is skipped.
    Returns None when no handler succeeds.
    '''
    for handler in _date_handlers:
        try:
            candidate = handler(dateString)
            if candidate:
                if len(candidate) != 9:
                    if _debug:
                        sys.stderr.write('date handler function must return 9-tuple\n')
                    raise ValueError
                # Validation only: every field must convert to int.
                [int(field) for field in candidate]
                return candidate
        except Exception:
            if _debug:
                sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(sys.exc_info()[1])))
    return None
5.2244 +
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    Returns the 5-tuple (true_encoding, http_encoding, xml_encoding,
    sniffed_xml_encoding, acceptable_content_type).

    Precedence, following RFC 3023 ('XML Media Types'):

    - application/xml, application/*+xml, application/xml-dtd and
      application/xml-external-parsed-entity: the charset parameter of the
      HTTP Content-Type wins over the encoding in the XML declaration;
      'utf-8' if neither is given.
    - text/xml, text/*+xml and text/xml-external-parsed-entity: the XML
      declaration is ignored; only the HTTP charset counts, defaulting to
      'us-ascii'.
    - any other text/* type: treated the same way ('us-ascii' default) but
      not reported as an acceptable content type.
    - headers present but no Content-Type: XML declaration, else
      'iso-8859-1' as per the HTTP specification (RFC 2616).
    - otherwise: XML declaration, else 'utf-8'.

    Before the XML declaration is searched, the first bytes of the data are
    sniffed as described in section F of the XML specification
    (http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info); data in a
    non-ASCII-compatible encoding is converted so that the declaration can
    be read.

    None of this guarantees that the feed can be parsed in the declared
    encoding (many feeds declare it wrongly).  CJKCodecs and iconv_codec
    help a lot; you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Single quotes are stripped from the charset value.
        '''
        media_type, params = cgi.parse_header(content_type or '')
        charset = params.get('charset', '')
        return media_type, charset.replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))

    # Signatures checked in order; the first hit wins.  Each entry is
    # (leading bytes, needs the UTF-16 guard, bytes to drop, encoding).
    # The guard keeps a UTF-32LE BOM from being taken for a UTF-16LE one.
    signatures = (
        ('\x00\x3c\x00\x3f', 0, 0, 'utf-16be'),  # UTF-16BE
        ('\xfe\xff', 1, 2, 'utf-16be'),          # UTF-16BE with BOM
        ('\x3c\x00\x3f\x00', 0, 0, 'utf-16le'),  # UTF-16LE
        ('\xff\xfe', 1, 2, 'utf-16le'),          # UTF-16LE with BOM
        ('\x00\x00\x00\x3c', 0, 0, 'utf-32be'),  # UTF-32BE
        ('\x3c\x00\x00\x00', 0, 0, 'utf-32le'),  # UTF-32LE
        ('\x00\x00\xfe\xff', 0, 4, 'utf-32be'),  # UTF-32BE with BOM
        ('\xff\xfe\x00\x00', 0, 4, 'utf-32le'),  # UTF-32LE with BOM
        ('\xef\xbb\xbf', 0, 3, 'utf-8'),         # UTF-8 with BOM
    )
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        else:
            for marker, guarded, skip, codec in signatures:
                if xml_data[:len(marker)] != marker:
                    continue
                if guarded and ((len(xml_data) < 4) or (xml_data[2:4] == '\x00\x00')):
                    continue
                # Record the sniffed encoding before decoding: it is kept
                # even if the conversion below raises.
                sniffed_xml_encoding = codec
                xml_data = unicode(xml_data[skip:], codec).encode('utf-8')
                break
            # no signature matched: ASCII-compatible, leave the data alone
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # A generic multi-byte name in the declaration is replaced by the
        # specific encoding that was sniffed.
        generic_names = ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')
        if sniffed_xml_encoding and (xml_encoding in generic_names):
            xml_encoding = sniffed_xml_encoding

    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    has_xml_suffix = http_content_type.endswith('+xml')
    is_application_xml = (http_content_type in application_content_types) or \
        (http_content_type.startswith('application/') and has_xml_suffix)
    is_text_xml = (http_content_type in text_content_types) or \
        (http_content_type.startswith('text/') and has_xml_suffix)
    if is_application_xml:
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif is_text_xml:
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
5.2380 +
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases

    A leading Byte Order Mark is stripped and, when present, overrides the
    requested encoding.  The data is decoded, its XML declaration is
    replaced (or one is prepended) so that it names utf-8, and the result
    is returned encoded as utf-8.  Decoding errors propagate to the caller.
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # (BOM bytes, encoding the BOM implies, needs the UTF-16 guard).
    # Checked in order; the guard stops a UTF-32LE BOM from being read as
    # a UTF-16LE one.
    byte_order_marks = (
        ('\xfe\xff', 'utf-16be', 1),
        ('\xff\xfe', 'utf-16le', 1),
        ('\xef\xbb\xbf', 'utf-8', 0),
        ('\x00\x00\xfe\xff', 'utf-32be', 0),
        ('\xff\xfe\x00\x00', 'utf-32le', 0),
    )
    # strip Byte Order Mark (if present)
    for bom, bom_encoding, guarded in byte_order_marks:
        if data[:len(bom)] != bom:
            continue
        if guarded and ((len(data) < 4) or (data[2:4] == '\x00\x00')):
            continue
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != bom_encoding:
                sys.stderr.write('trying %s instead\n' % bom_encoding)
        encoding = bom_encoding
        data = data[len(bom):]
        break
    decoded = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declaration_rx = re.compile('^<\?xml[^>]*?>')
    utf8_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declaration_rx.search(decoded):
        decoded = declaration_rx.sub(utf8_declaration, decoded)
    else:
        decoded = utf8_declaration + u'\n' + decoded
    return decoded.encode('utf-8')
5.2433 +
5.2434 +def _stripDoctype(data):
5.2435 + '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
5.2436 +
5.2437 + rss_version may be 'rss091n' or None
5.2438 + stripped_data is the same XML document, minus the DOCTYPE
5.2439 + '''
5.2440 + entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
5.2441 + data = entity_pattern.sub('', data)
5.2442 + doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
5.2443 + doctype_results = doctype_pattern.findall(data)
5.2444 + doctype = doctype_results and doctype_results[0] or ''
5.2445 + if doctype.lower().count('netscape'):
5.2446 + version = 'rss091n'
5.2447 + else:
5.2448 + version = None
5.2449 + data = doctype_pattern.sub('', data)
5.2450 + return version, data
5.2451 +
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
    '''Parse a feed from a URL, file, stream, or string

    etag, modified, agent, referrer and handlers are passed straight to
    _open_resource.  A single handler instance may be given instead of a
    list.

    Returns a FeedParserDict.  'feed' and 'entries' are always present;
    'bozo' / 'bozo_exception' record any problem met while fetching,
    decompressing, decoding or parsing; 'etag', 'modified', 'href',
    'status' and 'headers' are filled in when the opened resource exposes
    them; 'encoding', 'version' and 'namespaces' describe the parsed
    document.  Fetch and parse errors are recorded in the result rather
    than raised.
    '''
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception:
        # sys.exc_info() is used instead of the 'except X, e' form so the
        # handler syntax is valid on both old and new interpreters.
        e = sys.exc_info()[1]
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None

    # if feed is gzip-compressed, decompress it
    if f and data and hasattr(f, 'headers'):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage. Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                e = sys.exc_info()[1]
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception:
                e = sys.exc_info()[1]
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        result['etag'] = info.getheader('ETag')
        last_modified = info.getheader('Last-Modified')
        if last_modified:
            result['modified'] = _parse_date(last_modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        # default status; overridden just below when f reports its own
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers['content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    result['version'], data = _stripDoctype(data)

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data. This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if not data:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding: continue
        if proposed_encoding in tried_encodings: continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
            break
        except:
            # conversion failed; fall through to the next candidate
            pass
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, and windows-1252 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        # decoded, but not with the encoding the document declared
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except Exception:
            e = sys.exc_info()[1]
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            # strict parsing failed; fall back to the loose parser below
            use_strict_parser = 0
    if not use_strict_parser:
        feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
        feedparser.feed(data)
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
5.2632 +
if __name__ == '__main__':
    # Command-line use: parse every URL given as an argument and
    # pretty-print the result; with no arguments, show the module docstring.
    urls = sys.argv[1:]
    if not urls:
        sys.stdout.write('%s\n' % __doc__)
        sys.exit(0)
    zopeCompatibilityHack()
    from pprint import pprint
    for url in urls:
        sys.stdout.write('%s\n' % url)
        sys.stdout.write('\n')
        result = parse(url)
        pprint(result)
        sys.stdout.write('\n')
5.2647 +
5.2648 +#REVISION HISTORY
5.2649 +#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
5.2650 +# added Simon Fell's test suite
5.2651 +#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
5.2652 +#2.0 - 10/19/2002
5.2653 +# JD - use inchannel to watch out for image and textinput elements which can
5.2654 +# also contain title, link, and description elements
5.2655 +# JD - check for isPermaLink='false' attribute on guid elements
5.2656 +# JD - replaced openAnything with open_resource supporting ETag and
5.2657 +# If-Modified-Since request headers
5.2658 +# JD - parse now accepts etag, modified, agent, and referrer optional
5.2659 +# arguments
5.2660 +# JD - modified parse to return a dictionary instead of a tuple so that any
5.2661 +# etag or modified information can be returned and cached by the caller
5.2662 +#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
5.2663 +# because of etag/modified, return the old etag/modified to the caller to
5.2664 +# indicate why nothing is being returned
5.2665 +#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
5.2666 +# useless. Fixes the problem JD was addressing by adding it.
5.2667 +#2.1 - 11/14/2002 - MAP - added gzip support
5.2668 +#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
5.2669 +# start_admingeneratoragent is an example of how to handle elements with
5.2670 +# only attributes, no content.
5.2671 +#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
5.2672 +# also, make sure we send the User-Agent even if urllib2 isn't available.
5.2673 +# Match any variation of backend.userland.com/rss namespace.
5.2674 +#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
5.2675 +#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
5.2676 +# snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
5.2677 +# project name
5.2678 +#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
5.2679 +# removed unnecessary urllib code -- urllib2 should always be available anyway;
5.2680 +# return actual url, status, and full HTTP headers (as result['url'],
5.2681 +# result['status'], and result['headers']) if parsing a remote feed over HTTP --
5.2682 +# this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
5.2683 +# added the latest namespace-of-the-week for RSS 2.0
5.2684 +#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
5.2685 +# User-Agent (otherwise urllib2 sends two, which confuses some servers)
5.2686 +#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
5.2687 +# inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
5.2688 +#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
5.2689 +# textInput, and also to return the character encoding (if specified)
5.2690 +#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
5.2691 +# nested divs within content (JohnD); fixed missing sys import (JohanS);
5.2692 +# fixed regular expression to capture XML character encoding (Andrei);
5.2693 +# added support for Atom 0.3-style links; fixed bug with textInput tracking;
5.2694 +# added support for cloud (MartijnP); added support for multiple
5.2695 +# category/dc:subject (MartijnP); normalize content model: 'description' gets
5.2696 +# description (which can come from description, summary, or full content if no
5.2697 +# description), 'content' gets dict of base/language/type/value (which can come
5.2698 +# from content:encoded, xhtml:body, content, or fullitem);
5.2699 +# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
5.2700 +# tracking; fixed bug tracking unknown tags; fixed bug tracking content when
5.2701 +# <content> element is not in default namespace (like Pocketsoap feed);
5.2702 +# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
5.2703 +# wfw:commentRSS; resolve relative URLs within embedded HTML markup in
5.2704 +# description, xhtml:body, content, content:encoded, title, subtitle,
5.2705 +# summary, info, tagline, and copyright; added support for pingback and
5.2706 +# trackback namespaces
5.2707 +#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
5.2708 +# namespaces, as opposed to 2.6 when I said I did but didn't really;
5.2709 +# sanitize HTML markup within some elements; added mxTidy support (if
5.2710 +# installed) to tidy HTML markup within some elements; fixed indentation
5.2711 +# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
5.2712 +# (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
5.2713 +# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
5.2714 +# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
5.2715 +# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
5.2716 +#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
5.2717 +# leak not closing url opener (JohnD); added dc:publisher support (MarekK);
5.2718 +# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
5.2719 +#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
5.2720 +# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
5.2721 +# fixed relative URI processing for guid (skadz); added ICBM support; added
5.2722 +# base64 support
5.2723 +#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
5.2724 +# blogspot.com sites); added _debug variable
5.2725 +#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
5.2726 +#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
5.2727 +# added several new supported namespaces; fixed bug tracking naked markup in
5.2728 +# description; added support for enclosure; added support for source; re-added
5.2729 +# support for cloud which got dropped somehow; added support for expirationDate
5.2730 +#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
5.2731 +# xml:base URI, one for documents that don't define one explicitly and one for
5.2732 +# documents that define an outer and an inner xml:base that goes out of scope
5.2733 +# before the end of the document
5.2734 +#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
5.2735 +#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
5.2736 +# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
5.2737 +# added support for creativeCommons:license and cc:license; added support for
5.2738 +# full Atom content model in title, tagline, info, copyright, summary; fixed bug
5.2739 +# with gzip encoding (not always telling server we support it when we do)
5.2740 +#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
5.2741 +# (dictionary of 'name', 'url', 'email'); map author to author_detail if author
5.2742 +# contains name + email address
5.2743 +#3.0b8 - 1/28/2004 - MAP - added support for contributor
5.2744 +#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
5.2745 +# support for summary
5.2746 +#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
5.2747 +# xml.util.iso8601
5.2748 +#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
5.2749 +# dangerous markup; fiddled with decodeEntities (not right); liberalized
5.2750 +# date parsing even further
5.2751 +#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
5.2752 +# added support to Atom 0.2 subtitle; added support for Atom content model
5.2753 +# in copyright; better sanitizing of dangerous HTML elements with end tags
5.2754 +# (script, frameset)
5.2755 +#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
5.2756 +# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
5.2757 +#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
5.2758 +# Python 2.1
5.2759 +#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
5.2760 +# fixed bug capturing author and contributor URL; fixed bug resolving relative
5.2761 +# links in author and contributor URL; fixed bug resolving relative links in
5.2762 +# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
5.2763 +# namespace tests, and included them permanently in the test suite with his
5.2764 +# permission; fixed namespace handling under Python 2.1
5.2765 +#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
5.2766 +#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
5.2767 +#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
5.2768 +# use libxml2 (if available)
5.2769 +#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
5.2770 +# name was in parentheses; removed ultra-problematic mxTidy support; patch to
5.2771 +# workaround crash in PyXML/expat when encountering invalid entities
5.2772 +# (MarkMoraes); support for textinput/textInput
5.2773 +#3.0b20 - 4/7/2004 - MAP - added CDF support
5.2774 +#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
5.2775 +#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
5.2776 +# results dict; changed results dict to allow getting values with results.key
5.2777 +# as well as results[key]; work around embedded illformed HTML with half
5.2778 +# a DOCTYPE; work around malformed Content-Type header; if character encoding
5.2779 +# is wrong, try several common ones before falling back to regexes (if this
5.2780 +# works, bozo_exception is set to CharacterEncodingOverride); fixed character
5.2781 +# encoding issues in BaseHTMLProcessor by tracking encoding and converting
5.2782 +# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
5.2783 +# convert each value in results to Unicode (if possible), even if using
5.2784 +# regex-based parsing
5.2785 +#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
5.2786 +# high-bit characters in attributes in embedded HTML in description (thanks
5.2787 +# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
5.2788 +# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
5.2789 +# about a mapped key
5.2790 +#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
5.2791 +# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
5.2792 +# cause the same encoding to be tried twice (even if it failed the first time);
5.2793 +# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
5.2794 +# better textinput and image tracking in illformed RSS 1.0 feeds
5.2795 +#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
5.2796 +# my blink tag tests
5.2797 +#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
5.2798 +# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
5.2799 +# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
5.2800 +# added support for image; refactored parse() fallback logic to try other
5.2801 +# encodings if SAX parsing fails (previously it would only try other encodings
5.2802 +# if re-encoding failed); remove unichr madness in normalize_attrs now that
5.2803 +# we're properly tracking encoding in and out of BaseHTMLProcessor; set
5.2804 +# feed.language from root-level xml:lang; set entry.id from rdf:about;
5.2805 +# send Accept header
5.2806 +#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
5.2807 +# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
5.2808 +# windows-1252); fixed regression that could cause the same encoding to be
5.2809 +# tried twice (even if it failed the first time)
5.2810 +#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
5.2811 +# recover from malformed content-type header parameter with no equals sign
5.2812 +# ('text/xml; charset:iso-8859-1')
5.2813 +#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
5.2814 +# to Unicode equivalents in illformed feeds (aaronsw); added and
5.2815 +# passed tests for converting character entities to Unicode equivalents
5.2816 +# in illformed feeds (aaronsw); test for valid parsers when setting
5.2817 +# XML_AVAILABLE; make version and encoding available when server returns
5.2818 +# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
5.2819 +# digest auth or proxy support); add code to parse username/password
5.2820 +# out of url and send as basic authentication; expose downloading-related
5.2821 +# exceptions in bozo_exception (aaronsw); added __contains__ method to
5.2822 +# FeedParserDict (aaronsw); added publisher_detail (aaronsw)
5.2823 +#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
5.2824 +# convert feed to UTF-8 before passing to XML parser; completely revamped
5.2825 +# logic for determining character encoding and attempting XML parsing
5.2826 +# (much faster); increased default timeout to 20 seconds; test for presence
5.2827 +# of Location header on redirects; added tests for many alternate character
5.2828 +# encodings; support various EBCDIC encodings; support UTF-16BE and
5.2829 +# UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
5.2830 +# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
5.2831 +# XML parsers are available; added support for 'Content-encoding: deflate';
5.2832 +# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
5.2833 +# are available
5.2834 +#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
5.2835 +# problem tracking xml:base and xml:lang if element declares it, child
5.2836 +# doesn't, first grandchild redeclares it, and second grandchild doesn't;
5.2837 +# refactored date parsing; defined public registerDateHandler so callers
5.2838 +# can add support for additional date formats at runtime; added support
5.2839 +# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
5.2840 +# zopeCompatibilityHack() which turns FeedParserDict into a regular
5.2841 +# dictionary, required for Zope compatibility, and also makes command-
5.2842 +# line debugging easier because pprint module formats real dictionaries
5.2843 +# better than dictionary-like objects; added NonXMLContentType exception,
5.2844 +# which is stored in bozo_exception when a feed is served with a non-XML
5.2845 +# media type such as 'text/plain'; respect Content-Language as default
5.2846 +# language if no xml:lang is present; cloud dict is now FeedParserDict;
5.2847 +# generator dict is now FeedParserDict; better tracking of xml:lang,
5.2848 +# including support for xml:lang='' to unset the current language;
5.2849 +# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
5.2850 +# namespace; don't overwrite final status on redirects (scenarios:
5.2851 +# redirecting to a URL that returns 304, redirecting to a URL that
5.2852 +# redirects to another URL with a different type of redirect); add
5.2853 +# support for HTTP 303 redirects
5.2854 +#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
5.2855 +# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
5.2856 +# support for Atom 1.0; support for iTunes extensions; new 'tags' for
5.2857 +# categories/keywords/etc. as array of dict
5.2858 +# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
5.2859 +# terminology; parse RFC 822-style dates with no time; lots of other
5.2860 +# bug fixes
5.2861 +#4.1 - MAP - removed socket timeout; added support for chardet library
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
6.2 +++ b/trunk/quahog/plugins/Weather/local/simplejson/__init__.py Thu Oct 22 10:14:56 2009 -0400
6.3 @@ -0,0 +1,318 @@
6.4 +r"""JSON (JavaScript Object Notation) <http://json.org> is a subset of
6.5 +JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data
6.6 +interchange format.
6.7 +
6.8 +:mod:`simplejson` exposes an API familiar to users of the standard library
6.9 +:mod:`marshal` and :mod:`pickle` modules. It is the externally maintained
6.10 +version of the :mod:`json` library contained in Python 2.6, but maintains
6.11 +compatibility with Python 2.4 and Python 2.5 and (currently) has
6.12 +significant performance advantages, even without using the optional C
6.13 +extension for speedups.
6.14 +
6.15 +Encoding basic Python object hierarchies::
6.16 +
6.17 + >>> import simplejson as json
6.18 + >>> json.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}])
6.19 + '["foo", {"bar": ["baz", null, 1.0, 2]}]'
6.20 + >>> print json.dumps("\"foo\bar")
6.21 + "\"foo\bar"
6.22 + >>> print json.dumps(u'\u1234')
6.23 + "\u1234"
6.24 + >>> print json.dumps('\\')
6.25 + "\\"
6.26 + >>> print json.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True)
6.27 + {"a": 0, "b": 0, "c": 0}
6.28 + >>> from StringIO import StringIO
6.29 + >>> io = StringIO()
6.30 + >>> json.dump(['streaming API'], io)
6.31 + >>> io.getvalue()
6.32 + '["streaming API"]'
6.33 +
6.34 +Compact encoding::
6.35 +
6.36 + >>> import simplejson as json
6.37 + >>> json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':'))
6.38 + '[1,2,3,{"4":5,"6":7}]'
6.39 +
6.40 +Pretty printing::
6.41 +
6.42 + >>> import simplejson as json
6.43 + >>> s = json.dumps({'4': 5, '6': 7}, sort_keys=True, indent=4)
6.44 + >>> print '\n'.join([l.rstrip() for l in s.splitlines()])
6.45 + {
6.46 + "4": 5,
6.47 + "6": 7
6.48 + }
6.49 +
6.50 +Decoding JSON::
6.51 +
6.52 + >>> import simplejson as json
6.53 + >>> obj = [u'foo', {u'bar': [u'baz', None, 1.0, 2]}]
6.54 + >>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == obj
6.55 + True
6.56 + >>> json.loads('"\\"foo\\bar"') == u'"foo\x08ar'
6.57 + True
6.58 + >>> from StringIO import StringIO
6.59 + >>> io = StringIO('["streaming API"]')
6.60 + >>> json.load(io)[0] == 'streaming API'
6.61 + True
6.62 +
6.63 +Specializing JSON object decoding::
6.64 +
6.65 + >>> import simplejson as json
6.66 + >>> def as_complex(dct):
6.67 + ... if '__complex__' in dct:
6.68 + ... return complex(dct['real'], dct['imag'])
6.69 + ... return dct
6.70 + ...
6.71 + >>> json.loads('{"__complex__": true, "real": 1, "imag": 2}',
6.72 + ... object_hook=as_complex)
6.73 + (1+2j)
6.74 + >>> import decimal
6.75 + >>> json.loads('1.1', parse_float=decimal.Decimal) == decimal.Decimal('1.1')
6.76 + True
6.77 +
6.78 +Specializing JSON object encoding::
6.79 +
6.80 + >>> import simplejson as json
6.81 + >>> def encode_complex(obj):
6.82 + ... if isinstance(obj, complex):
6.83 + ... return [obj.real, obj.imag]
6.84 + ... raise TypeError(repr(obj) + " is not JSON serializable")
6.85 + ...
6.86 + >>> json.dumps(2 + 1j, default=encode_complex)
6.87 + '[2.0, 1.0]'
6.88 + >>> json.JSONEncoder(default=encode_complex).encode(2 + 1j)
6.89 + '[2.0, 1.0]'
6.90 + >>> ''.join(json.JSONEncoder(default=encode_complex).iterencode(2 + 1j))
6.91 + '[2.0, 1.0]'
6.92 +
6.93 +
6.94 +Using simplejson.tool from the shell to validate and pretty-print::
6.95 +
6.96 + $ echo '{"json":"obj"}' | python -m simplejson.tool
6.97 + {
6.98 + "json": "obj"
6.99 + }
6.100 + $ echo '{ 1.2:3.4}' | python -m simplejson.tool
6.101 + Expecting property name: line 1 column 2 (char 2)
6.102 +"""
6.103 +__version__ = '2.0.9'
6.104 +__all__ = [
6.105 + 'dump', 'dumps', 'load', 'loads',
6.106 + 'JSONDecoder', 'JSONEncoder',
6.107 +]
6.108 +
6.109 +__author__ = 'Bob Ippolito <bob@redivi.com>'
6.110 +
6.111 +from decoder import JSONDecoder
6.112 +from encoder import JSONEncoder
6.113 +
6.114 +_default_encoder = JSONEncoder(
6.115 + skipkeys=False,
6.116 + ensure_ascii=True,
6.117 + check_circular=True,
6.118 + allow_nan=True,
6.119 + indent=None,
6.120 + separators=None,
6.121 + encoding='utf-8',
6.122 + default=None,
6.123 +)
6.124 +
6.125 +def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
6.126 + allow_nan=True, cls=None, indent=None, separators=None,
6.127 + encoding='utf-8', default=None, **kw):
6.128 + """Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
6.129 + ``.write()``-supporting file-like object).
6.130 +
6.131 + If ``skipkeys`` is true then ``dict`` keys that are not basic types
6.132 + (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
6.133 + will be skipped instead of raising a ``TypeError``.
6.134 +
6.135 + If ``ensure_ascii`` is false, then some chunks written to ``fp``
6.136 + may be ``unicode`` instances, subject to normal Python ``str`` to
6.137 + ``unicode`` coercion rules. Unless ``fp.write()`` explicitly
6.138 + understands ``unicode`` (as in ``codecs.getwriter()``) this is likely
6.139 + to cause an error.
6.140 +
6.141 + If ``check_circular`` is false, then the circular reference check
6.142 + for container types will be skipped and a circular reference will
6.143 + result in an ``OverflowError`` (or worse).
6.144 +
6.145 + If ``allow_nan`` is false, then it will be a ``ValueError`` to
6.146 + serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``)
6.147 + in strict compliance of the JSON specification, instead of using the
6.148 + JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
6.149 +
6.150 + If ``indent`` is a non-negative integer, then JSON array elements and object
6.151 + members will be pretty-printed with that indent level. An indent level
6.152 + of 0 will only insert newlines. ``None`` is the most compact representation.
6.153 +
6.154 + If ``separators`` is an ``(item_separator, dict_separator)`` tuple
6.155 + then it will be used instead of the default ``(', ', ': ')`` separators.
6.156 + ``(',', ':')`` is the most compact JSON representation.
6.157 +
6.158 + ``encoding`` is the character encoding for str instances, default is UTF-8.
6.159 +
6.160 + ``default(obj)`` is a function that should return a serializable version
6.161 + of obj or raise TypeError. The default simply raises TypeError.
6.162 +
6.163 + To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
6.164 + ``.default()`` method to serialize additional types), specify it with
6.165 + the ``cls`` kwarg.
6.166 +
6.167 + """
6.168 + # cached encoder
6.169 + if (not skipkeys and ensure_ascii and
6.170 + check_circular and allow_nan and
6.171 + cls is None and indent is None and separators is None and
6.172 + encoding == 'utf-8' and default is None and not kw):
6.173 + iterable = _default_encoder.iterencode(obj)
6.174 + else:
6.175 + if cls is None:
6.176 + cls = JSONEncoder
6.177 + iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
6.178 + check_circular=check_circular, allow_nan=allow_nan, indent=indent,
6.179 + separators=separators, encoding=encoding,
6.180 + default=default, **kw).iterencode(obj)
6.181 + # could accelerate with writelines in some versions of Python, at
6.182 + # a debuggability cost
6.183 + for chunk in iterable:
6.184 + fp.write(chunk)
6.185 +
6.186 +
6.187 +def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
6.188 + allow_nan=True, cls=None, indent=None, separators=None,
6.189 + encoding='utf-8', default=None, **kw):
6.190 + """Serialize ``obj`` to a JSON formatted ``str``.
6.191 +
6.192 + If ``skipkeys`` is true then ``dict`` keys that are not basic types
6.193 + (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
6.194 + will be skipped instead of raising a ``TypeError``.
6.195 +
6.196 + If ``ensure_ascii`` is false, then the return value will be a
6.197 + ``unicode`` instance subject to normal Python ``str`` to ``unicode``
6.198 + coercion rules instead of being escaped to an ASCII ``str``.
6.199 +
6.200 + If ``check_circular`` is false, then the circular reference check
6.201 + for container types will be skipped and a circular reference will
6.202 + result in an ``OverflowError`` (or worse).
6.203 +
6.204 + If ``allow_nan`` is false, then it will be a ``ValueError`` to
6.205 + serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in
6.206 + strict compliance of the JSON specification, instead of using the
6.207 + JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
6.208 +
6.209 + If ``indent`` is a non-negative integer, then JSON array elements and
6.210 + object members will be pretty-printed with that indent level. An indent
6.211 + level of 0 will only insert newlines. ``None`` is the most compact
6.212 + representation.
6.213 +
6.214 + If ``separators`` is an ``(item_separator, dict_separator)`` tuple
6.215 + then it will be used instead of the default ``(', ', ': ')`` separators.
6.216 + ``(',', ':')`` is the most compact JSON representation.
6.217 +
6.218 + ``encoding`` is the character encoding for str instances, default is UTF-8.
6.219 +
6.220 + ``default(obj)`` is a function that should return a serializable version
6.221 + of obj or raise TypeError. The default simply raises TypeError.
6.222 +
6.223 + To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
6.224 + ``.default()`` method to serialize additional types), specify it with
6.225 + the ``cls`` kwarg.
6.226 +
6.227 + """
6.228 + # cached encoder
6.229 + if (not skipkeys and ensure_ascii and
6.230 + check_circular and allow_nan and
6.231 + cls is None and indent is None and separators is None and
6.232 + encoding == 'utf-8' and default is None and not kw):
6.233 + return _default_encoder.encode(obj)
6.234 + if cls is None:
6.235 + cls = JSONEncoder
6.236 + return cls(
6.237 + skipkeys=skipkeys, ensure_ascii=ensure_ascii,
6.238 + check_circular=check_circular, allow_nan=allow_nan, indent=indent,
6.239 + separators=separators, encoding=encoding, default=default,
6.240 + **kw).encode(obj)
6.241 +
6.242 +
6.243 +_default_decoder = JSONDecoder(encoding=None, object_hook=None)
6.244 +
6.245 +
6.246 +def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None,
6.247 + parse_int=None, parse_constant=None, **kw):
6.248 + """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
6.249 + a JSON document) to a Python object.
6.250 +
6.251 + If the contents of ``fp`` is encoded with an ASCII based encoding other
6.252 + than utf-8 (e.g. latin-1), then an appropriate ``encoding`` name must
6.253 + be specified. Encodings that are not ASCII based (such as UCS-2) are
6.254 + not allowed, and should be wrapped with
6.255 + ``codecs.getreader(encoding)(fp)``, or simply decoded to a ``unicode``
6.256 + object and passed to ``loads()``
6.257 +
6.258 + ``object_hook`` is an optional function that will be called with the
6.259 + result of any object literal decode (a ``dict``). The return value of
6.260 + ``object_hook`` will be used instead of the ``dict``. This feature
6.261 + can be used to implement custom decoders (e.g. JSON-RPC class hinting).
6.262 +
6.263 + To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
6.264 + kwarg.
6.265 +
6.266 + """
6.267 + return loads(fp.read(),
6.268 + encoding=encoding, cls=cls, object_hook=object_hook,
6.269 + parse_float=parse_float, parse_int=parse_int,
6.270 + parse_constant=parse_constant, **kw)
6.271 +
6.272 +
6.273 +def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None,
6.274 + parse_int=None, parse_constant=None, **kw):
6.275 + """Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON
6.276 + document) to a Python object.
6.277 +
6.278 + If ``s`` is a ``str`` instance and is encoded with an ASCII based encoding
6.279 + other than utf-8 (e.g. latin-1) then an appropriate ``encoding`` name
6.280 + must be specified. Encodings that are not ASCII based (such as UCS-2)
6.281 + are not allowed and should be decoded to ``unicode`` first.
6.282 +
6.283 + ``object_hook`` is an optional function that will be called with the
6.284 + result of any object literal decode (a ``dict``). The return value of
6.285 + ``object_hook`` will be used instead of the ``dict``. This feature
6.286 + can be used to implement custom decoders (e.g. JSON-RPC class hinting).
6.287 +
6.288 + ``parse_float``, if specified, will be called with the string
6.289 + of every JSON float to be decoded. By default this is equivalent to
6.290 + float(num_str). This can be used to use another datatype or parser
6.291 + for JSON floats (e.g. decimal.Decimal).
6.292 +
6.293 + ``parse_int``, if specified, will be called with the string
6.294 + of every JSON int to be decoded. By default this is equivalent to
6.295 + int(num_str). This can be used to use another datatype or parser
6.296 + for JSON integers (e.g. float).
6.297 +
6.298 + ``parse_constant``, if specified, will be called with one of the
6.299 + following strings: -Infinity, Infinity, NaN.
6.300 + This can be used to raise an exception if invalid JSON numbers
6.301 + are encountered.
6.302 +
6.303 + To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
6.304 + kwarg.
6.305 +
6.306 + """
6.307 + if (cls is None and encoding is None and object_hook is None and
6.308 + parse_int is None and parse_float is None and
6.309 + parse_constant is None and not kw):
6.310 + return _default_decoder.decode(s)
6.311 + if cls is None:
6.312 + cls = JSONDecoder
6.313 + if object_hook is not None:
6.314 + kw['object_hook'] = object_hook
6.315 + if parse_float is not None:
6.316 + kw['parse_float'] = parse_float
6.317 + if parse_int is not None:
6.318 + kw['parse_int'] = parse_int
6.319 + if parse_constant is not None:
6.320 + kw['parse_constant'] = parse_constant
6.321 + return cls(encoding=encoding, **kw).decode(s)
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
7.2 +++ b/trunk/quahog/plugins/Weather/local/simplejson/decoder.py Thu Oct 22 10:14:56 2009 -0400
7.3 @@ -0,0 +1,354 @@
7.4 +"""Implementation of JSONDecoder
7.5 +"""
7.6 +import re
7.7 +import sys
7.8 +import struct
7.9 +
7.10 +from scanner import make_scanner
7.11 +try:
7.12 + from _speedups import scanstring as c_scanstring
7.13 +except ImportError:
7.14 + c_scanstring = None
7.15 +
7.16 +__all__ = ['JSONDecoder']
7.17 +
7.18 +FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
7.19 +
7.20 +def _floatconstants():
7.21 + _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
7.22 + if sys.byteorder != 'big':
7.23 + _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
7.24 + nan, inf = struct.unpack('dd', _BYTES)
7.25 + return nan, inf, -inf
7.26 +
7.27 +NaN, PosInf, NegInf = _floatconstants()
7.28 +
7.29 +
7.30 +def linecol(doc, pos):
7.31 + lineno = doc.count('\n', 0, pos) + 1
7.32 + if lineno == 1:
7.33 + colno = pos
7.34 + else:
7.35 + colno = pos - doc.rindex('\n', 0, pos)
7.36 + return lineno, colno
7.37 +
7.38 +
7.39 +def errmsg(msg, doc, pos, end=None):
7.40 + # Note that this function is called from _speedups
7.41 + lineno, colno = linecol(doc, pos)
7.42 + if end is None:
7.43 + #fmt = '{0}: line {1} column {2} (char {3})'
7.44 + #return fmt.format(msg, lineno, colno, pos)
7.45 + fmt = '%s: line %d column %d (char %d)'
7.46 + return fmt % (msg, lineno, colno, pos)
7.47 + endlineno, endcolno = linecol(doc, end)
7.48 + #fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
7.49 + #return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
7.50 + fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
7.51 + return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
7.52 +
7.53 +
7.54 +_CONSTANTS = {
7.55 + '-Infinity': NegInf,
7.56 + 'Infinity': PosInf,
7.57 + 'NaN': NaN,
7.58 +}
7.59 +
7.60 +STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
7.61 +BACKSLASH = {
7.62 + '"': u'"', '\\': u'\\', '/': u'/',
7.63 + 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
7.64 +}
7.65 +
7.66 +DEFAULT_ENCODING = "utf-8"
7.67 +
7.68 +def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
7.69 + """Scan the string s for a JSON string. End is the index of the
7.70 + character in s after the quote that started the JSON string.
7.71 + Unescapes all valid JSON string escape sequences and raises ValueError
7.72 + on attempt to decode an invalid string. If strict is False then literal
7.73 + control characters are allowed in the string.
7.74 +
7.75 + Returns a tuple of the decoded string and the index of the character in s
7.76 + after the end quote."""
7.77 + if encoding is None:
7.78 + encoding = DEFAULT_ENCODING
7.79 + chunks = []
7.80 + _append = chunks.append
7.81 + begin = end - 1
7.82 + while 1:
7.83 + chunk = _m(s, end)
7.84 + if chunk is None:
7.85 + raise ValueError(
7.86 + errmsg("Unterminated string starting at", s, begin))
7.87 + end = chunk.end()
7.88 + content, terminator = chunk.groups()
7.89 + # Content contains zero or more unescaped string characters
7.90 + if content:
7.91 + if not isinstance(content, unicode):
7.92 + content = unicode(content, encoding)
7.93 + _append(content)
7.94 + # Terminator is the end of string, a literal control character,
7.95 + # or a backslash denoting that an escape sequence follows
7.96 + if terminator == '"':
7.97 + break
7.98 + elif terminator != '\\':
7.99 + if strict:
7.100 + msg = "Invalid control character %r at" % (terminator,)
7.101 + #msg = "Invalid control character {0!r} at".format(terminator)
7.102 + raise ValueError(errmsg(msg, s, end))
7.103 + else:
7.104 + _append(terminator)
7.105 + continue
7.106 + try:
7.107 + esc = s[end]
7.108 + except IndexError:
7.109 + raise ValueError(
7.110 + errmsg("Unterminated string starting at", s, begin))
7.111 + # If not a unicode escape sequence, must be in the lookup table
7.112 + if esc != 'u':
7.113 + try:
7.114 + char = _b[esc]
7.115 + except KeyError:
7.116 + msg = "Invalid \\escape: " + repr(esc)
7.117 + raise ValueError(errmsg(msg, s, end))
7.118 + end += 1
7.119 + else:
7.120 + # Unicode escape sequence
7.121 + esc = s[end + 1:end + 5]
7.122 + next_end = end + 5
7.123 + if len(esc) != 4:
7.124 + msg = "Invalid \\uXXXX escape"
7.125 + raise ValueError(errmsg(msg, s, end))
7.126 + uni = int(esc, 16)
7.127 + # Check for surrogate pair on UCS-4 systems
7.128 + if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
7.129 + msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
7.130 + if not s[end + 5:end + 7] == '\\u':
7.131 + raise ValueError(errmsg(msg, s, end))
7.132 + esc2 = s[end + 7:end + 11]
7.133 + if len(esc2) != 4:
7.134 + raise ValueError(errmsg(msg, s, end))
7.135 + uni2 = int(esc2, 16)
7.136 + uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
7.137 + next_end += 6
7.138 + char = unichr(uni)
7.139 + end = next_end
7.140 + # Append the unescaped character
7.141 + _append(char)
7.142 + return u''.join(chunks), end
7.143 +
7.144 +
7.145 +# Use speedup if available
7.146 +scanstring = c_scanstring or py_scanstring
7.147 +
7.148 +WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
7.149 +WHITESPACE_STR = ' \t\n\r'
7.150 +
7.151 +def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
7.152 + pairs = {}
7.153 + # Use a slice to prevent IndexError from being raised, the following
7.154 + # check will raise a more specific ValueError if the string is empty
7.155 + nextchar = s[end:end + 1]
7.156 + # Normally we expect nextchar == '"'
7.157 + if nextchar != '"':
7.158 + if nextchar in _ws:
7.159 + end = _w(s, end).end()
7.160 + nextchar = s[end:end + 1]
7.161 + # Trivial empty object
7.162 + if nextchar == '}':
7.163 + return pairs, end + 1
7.164 + elif nextchar != '"':
7.165 + raise ValueError(errmsg("Expecting property name", s, end))
7.166 + end += 1
7.167 + while True:
7.168 + key, end = scanstring(s, end, encoding, strict)
7.169 +
7.170 + # To skip some function call overhead we optimize the fast paths where
7.171 + # the JSON key separator is ": " or just ":".
7.172 + if s[end:end + 1] != ':':
7.173 + end = _w(s, end).end()
7.174 + if s[end:end + 1] != ':':
7.175 + raise ValueError(errmsg("Expecting : delimiter", s, end))
7.176 +
7.177 + end += 1
7.178 +
7.179 + try:
7.180 + if s[end] in _ws:
7.181 + end += 1
7.182 + if s[end] in _ws:
7.183 + end = _w(s, end + 1).end()
7.184 + except IndexError:
7.185 + pass
7.186 +
7.187 + try:
7.188 + value, end = scan_once(s, end)
7.189 + except StopIteration:
7.190 + raise ValueError(errmsg("Expecting object", s, end))
7.191 + pairs[key] = value
7.192 +
7.193 + try:
7.194 + nextchar = s[end]
7.195 + if nextchar in _ws:
7.196 + end = _w(s, end + 1).end()
7.197 + nextchar = s[end]
7.198 + except IndexError:
7.199 + nextchar = ''
7.200 + end += 1
7.201 +
7.202 + if nextchar == '}':
7.203 + break
7.204 + elif nextchar != ',':
7.205 + raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
7.206 +
7.207 + try:
7.208 + nextchar = s[end]
7.209 + if nextchar in _ws:
7.210 + end += 1
7.211 + nextchar = s[end]
7.212 + if nextchar in _ws:
7.213 + end = _w(s, end + 1).end()
7.214 + nextchar = s[end]
7.215 + except IndexError:
7.216 + nextchar = ''
7.217 +
7.218 + end += 1
7.219 + if nextchar != '"':
7.220 + raise ValueError(errmsg("Expecting property name", s, end - 1))
7.221 +
7.222 + if object_hook is not None:
7.223 + pairs = object_hook(pairs)
7.224 + return pairs, end
7.225 +
7.226 +def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
7.227 + values = [] # decoded elements; returned together with the index just past ']'
7.228 + nextchar = s[end:end + 1] # slice (not index) so end-of-input yields '' instead of IndexError
7.229 + if nextchar in _ws:
7.230 + end = _w(s, end + 1).end()
7.231 + nextchar = s[end:end + 1]
7.232 + # Look-ahead for trivial empty array
7.233 + if nextchar == ']':
7.234 + return values, end + 1
7.235 + _append = values.append
7.236 + while True:
7.237 + try:
7.238 + value, end = scan_once(s, end)
7.239 + except StopIteration:
7.240 + raise ValueError(errmsg("Expecting object", s, end))
7.241 + _append(value)
7.242 + nextchar = s[end:end + 1]
7.243 + if nextchar in _ws:
7.244 + end = _w(s, end + 1).end()
7.245 + nextchar = s[end:end + 1]
7.246 + end += 1 # step past nextchar (either ',' or ']')
7.247 + if nextchar == ']':
7.248 + break
7.249 + elif nextchar != ',':
7.250 + raise ValueError(errmsg("Expecting , delimiter", s, end - 1)) # end - 1: report the offending character itself, as JSONObject does
7.251 +
7.252 + try:
7.253 + if s[end] in _ws:
7.254 + end += 1
7.255 + if s[end] in _ws:
7.256 + end = _w(s, end + 1).end()
7.257 + except IndexError:
7.258 + pass # ran off the end of s; the next scan_once call reports the error
7.259 +
7.260 + return values, end
7.261 +
7.262 +class JSONDecoder(object):
7.263 + """Simple JSON <http://json.org> decoder
7.264 +
7.265 + Performs the following translations in decoding by default:
7.266 +
7.267 + +---------------+-------------------+
7.268 + | JSON | Python |
7.269 + +===============+===================+
7.270 + | object | dict |
7.271 + +---------------+-------------------+
7.272 + | array | list |
7.273 + +---------------+-------------------+
7.274 + | string | unicode |
7.275 + +---------------+-------------------+
7.276 + | number (int) | int, long |
7.277 + +---------------+-------------------+
7.278 + | number (real) | float |
7.279 + +---------------+-------------------+
7.280 + | true | True |
7.281 + +---------------+-------------------+
7.282 + | false | False |
7.283 + +---------------+-------------------+
7.284 + | null | None |
7.285 + +---------------+-------------------+
7.286 +
7.287 + It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
7.288 + their corresponding ``float`` values, which is outside the JSON spec.
7.289 +
7.290 + """
7.291 +
7.292 + def __init__(self, encoding=None, object_hook=None, parse_float=None,
7.293 + parse_int=None, parse_constant=None, strict=True):
7.294 + """``encoding`` determines the encoding used to interpret any ``str``
7.295 + objects decoded by this instance (utf-8 by default). It has no
7.296 + effect when decoding ``unicode`` objects.
7.297 +
7.298 + Note that currently only encodings that are a superset of ASCII work,
7.299 + strings of other encodings should be passed in as ``unicode``.
7.300 +
7.301 + ``object_hook``, if specified, will be called with the result
7.302 + of every JSON object decoded and its return value will be used in
7.303 + place of the given ``dict``. This can be used to provide custom
7.304 + deserializations (e.g. to support JSON-RPC class hinting).
7.305 +
7.306 + ``parse_float``, if specified, will be called with the string
7.307 + of every JSON float to be decoded. By default this is equivalent to
7.308 + float(num_str). This can be used to use another datatype or parser
7.309 + for JSON floats (e.g. decimal.Decimal).
7.310 +
7.311 + ``parse_int``, if specified, will be called with the string
7.312 + of every JSON int to be decoded. By default this is equivalent to
7.313 + int(num_str). This can be used to use another datatype or parser
7.314 + for JSON integers (e.g. float).
7.315 +
7.316 + ``parse_constant``, if specified, will be called with one of the
7.317 + following strings: -Infinity, Infinity, NaN.
7.318 + This can be used to raise an exception if invalid JSON numbers
7.319 + are encountered.
7.320 +
7.321 + """
7.322 + self.encoding = encoding
7.323 + self.object_hook = object_hook
7.324 + self.parse_float = parse_float or float
7.325 + self.parse_int = parse_int or int
7.326 + self.parse_constant = parse_constant or _CONSTANTS.__getitem__
7.327 + self.strict = strict
7.328 + self.parse_object = JSONObject
7.329 + self.parse_array = JSONArray
7.330 + self.parse_string = scanstring
7.331 + self.scan_once = make_scanner(self)
7.332 +
7.333 + def decode(self, s, _w=WHITESPACE.match):
7.334 + """Return the Python representation of ``s`` (a ``str`` or ``unicode``
7.335 + instance containing a JSON document)
7.336 +
7.337 + """
7.338 + obj, end = self.raw_decode(s, idx=_w(s, 0).end())
7.339 + end = _w(s, end).end()
7.340 + if end != len(s):
7.341 + raise ValueError(errmsg("Extra data", s, end, len(s)))
7.342 + return obj
7.343 +
7.344 + def raw_decode(self, s, idx=0):
7.345 + """Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning
7.346 + with a JSON document) and return a 2-tuple of the Python
7.347 + representation and the index in ``s`` where the document ended.
7.348 +
7.349 + This can be used to decode a JSON document from a string that may
7.350 + have extraneous data at the end.
7.351 +
7.352 + """
7.353 + try:
7.354 + obj, end = self.scan_once(s, idx)
7.355 + except StopIteration:
7.356 + raise ValueError("No JSON object could be decoded")
7.357 + return obj, end
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
8.2 +++ b/trunk/quahog/plugins/Weather/local/simplejson/encoder.py Thu Oct 22 10:14:56 2009 -0400
8.3 @@ -0,0 +1,440 @@
8.4 +"""Implementation of JSONEncoder
8.5 +"""
8.6 +import re
8.7 +
8.8 +try:
8.9 + from _speedups import encode_basestring_ascii as c_encode_basestring_ascii
8.10 +except ImportError:
8.11 + c_encode_basestring_ascii = None
8.12 +try:
8.13 + from _speedups import make_encoder as c_make_encoder
8.14 +except ImportError:
8.15 + c_make_encoder = None
8.16 +
8.17 +ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
8.18 +ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
8.19 +HAS_UTF8 = re.compile(r'[\x80-\xff]')
8.20 +ESCAPE_DCT = {
8.21 + '\\': '\\\\',
8.22 + '"': '\\"',
8.23 + '\b': '\\b',
8.24 + '\f': '\\f',
8.25 + '\n': '\\n',
8.26 + '\r': '\\r',
8.27 + '\t': '\\t',
8.28 +}
8.29 +for i in range(0x20):
8.30 + #ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
8.31 + ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
8.32 +
8.33 +# Assume this produces an infinity on all machines (probably not guaranteed)
8.34 +INFINITY = float('1e66666')
8.35 +FLOAT_REPR = repr
8.36 +
8.37 +def encode_basestring(s):
8.38 + """Return a JSON representation of a Python string
8.39 +
8.40 + """
8.41 + def replace(match):
8.42 + return ESCAPE_DCT[match.group(0)]
8.43 + return '"' + ESCAPE.sub(replace, s) + '"'
8.44 +
8.45 +
8.46 +def py_encode_basestring_ascii(s):
8.47 + """Return an ASCII-only JSON representation of a Python string
8.48 +
8.49 + """
8.50 + if isinstance(s, str) and HAS_UTF8.search(s) is not None:
8.51 + s = s.decode('utf-8')
8.52 + def replace(match):
8.53 + s = match.group(0)
8.54 + try:
8.55 + return ESCAPE_DCT[s]
8.56 + except KeyError:
8.57 + n = ord(s)
8.58 + if n < 0x10000:
8.59 + #return '\\u{0:04x}'.format(n)
8.60 + return '\\u%04x' % (n,)
8.61 + else:
8.62 + # surrogate pair
8.63 + n -= 0x10000
8.64 + s1 = 0xd800 | ((n >> 10) & 0x3ff)
8.65 + s2 = 0xdc00 | (n & 0x3ff)
8.66 + #return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
8.67 + return '\\u%04x\\u%04x' % (s1, s2)
8.68 + return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
8.69 +
8.70 +
8.71 +encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii
8.72 +
8.73 +class JSONEncoder(object):
8.74 + """Extensible JSON <http://json.org> encoder for Python data structures.
8.75 +
8.76 + Supports the following objects and types by default:
8.77 +
8.78 + +-------------------+---------------+
8.79 + | Python | JSON |
8.80 + +===================+===============+
8.81 + | dict | object |
8.82 + +-------------------+---------------+
8.83 + | list, tuple | array |
8.84 + +-------------------+---------------+
8.85 + | str, unicode | string |
8.86 + +-------------------+---------------+
8.87 + | int, long, float | number |
8.88 + +-------------------+---------------+
8.89 + | True | true |
8.90 + +-------------------+---------------+
8.91 + | False | false |
8.92 + +-------------------+---------------+
8.93 + | None | null |
8.94 + +-------------------+---------------+
8.95 +
8.96 + To extend this to recognize other objects, subclass and implement a
8.97 + ``.default()`` method that returns a serializable object for ``o``
8.98 + if possible; otherwise it should call the superclass implementation
8.99 + (to raise ``TypeError``).
8.100 +
8.101 + """
8.102 + item_separator = ', '
8.103 + key_separator = ': '
8.104 + def __init__(self, skipkeys=False, ensure_ascii=True,
8.105 + check_circular=True, allow_nan=True, sort_keys=False,
8.106 + indent=None, separators=None, encoding='utf-8', default=None):
8.107 + """Constructor for JSONEncoder, with sensible defaults.
8.108 +
8.109 + If skipkeys is false, then it is a TypeError to attempt
8.110 + encoding of keys that are not str, int, long, float or None. If
8.111 + skipkeys is True, such items are simply skipped.
8.112 +
8.113 + If ensure_ascii is true, the output is guaranteed to be str
8.114 + objects with all incoming unicode characters escaped. If
8.115 + ensure_ascii is false, the output will be a unicode object.
8.116 +
8.117 + If check_circular is true, then lists, dicts, and custom encoded
8.118 + objects will be checked for circular references during encoding to
8.119 + prevent an infinite recursion (which would cause an OverflowError).
8.120 + Otherwise, no such check takes place.
8.121 +
8.122 + If allow_nan is true, then NaN, Infinity, and -Infinity will be
8.123 + encoded as such. This behavior is not JSON specification compliant,
8.124 + but is consistent with most JavaScript based encoders and decoders.
8.125 + Otherwise, it will be a ValueError to encode such floats.
8.126 +
8.127 + If sort_keys is true, then the output of dictionaries will be
8.128 + sorted by key; this is useful for regression tests to ensure
8.129 + that JSON serializations can be compared on a day-to-day basis.
8.130 +
8.131 + If indent is a non-negative integer, then JSON array
8.132 + elements and object members will be pretty-printed with that
8.133 + indent level. An indent level of 0 will only insert newlines.
8.134 + None is the most compact representation.
8.135 +
8.136 + If specified, separators should be a (item_separator, key_separator)
8.137 + tuple. The default is (', ', ': '). To get the most compact JSON
8.138 + representation you should specify (',', ':') to eliminate whitespace.
8.139 +
8.140 + If specified, default is a function that gets called for objects
8.141 + that can't otherwise be serialized. It should return a JSON encodable
8.142 + version of the object or raise a ``TypeError``.
8.143 +
8.144 + If encoding is not None, then all input strings will be
8.145 + transformed into unicode using that encoding prior to JSON-encoding.
8.146 + The default is UTF-8.
8.147 +
8.148 + """
8.149 +
8.150 + self.skipkeys = skipkeys
8.151 + self.ensure_ascii = ensure_ascii
8.152 + self.check_circular = check_circular
8.153 + self.allow_nan = allow_nan
8.154 + self.sort_keys = sort_keys
8.155 + self.indent = indent
8.156 + if separators is not None:
8.157 + self.item_separator, self.key_separator = separators
8.158 + if default is not None:
8.159 + self.default = default
8.160 + self.encoding = encoding
8.161 +
8.162 + def default(self, o):
8.163 + """Implement this method in a subclass such that it returns
8.164 + a serializable object for ``o``, or calls the base implementation
8.165 + (to raise a ``TypeError``).
8.166 +
8.167 + For example, to support arbitrary iterators, you could
8.168 + implement default like this::
8.169 +
8.170 + def default(self, o):
8.171 + try:
8.172 + iterable = iter(o)
8.173 + except TypeError:
8.174 + pass
8.175 + else:
8.176 + return list(iterable)
8.177 + return JSONEncoder.default(self, o)
8.178 +
8.179 + """
8.180 + raise TypeError(repr(o) + " is not JSON serializable")
8.181 +
8.182 + def encode(self, o):
8.183 + """Return a JSON string representation of a Python data structure.
8.184 +
8.185 + >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
8.186 + '{"foo": ["bar", "baz"]}'
8.187 +
8.188 + """
8.189 + # This is for extremely simple cases and benchmarks.
8.190 + if isinstance(o, basestring):
8.191 + if isinstance(o, str):
8.192 + _encoding = self.encoding
8.193 + if (_encoding is not None
8.194 + and not (_encoding == 'utf-8')):
8.195 + o = o.decode(_encoding)
8.196 + if self.ensure_ascii:
8.197 + return encode_basestring_ascii(o)
8.198 + else:
8.199 + return encode_basestring(o)
8.200 + # This doesn't pass the iterator directly to ''.join() because the
8.201 + # exceptions aren't as detailed. The list call should be roughly
8.202 + # equivalent to the PySequence_Fast that ''.join() would do.
8.203 + chunks = self.iterencode(o, _one_shot=True)
8.204 + if not isinstance(chunks, (list, tuple)):
8.205 + chunks = list(chunks)
8.206 + return ''.join(chunks)
8.207 +
8.208 + def iterencode(self, o, _one_shot=False):
8.209 + """Encode the given object and yield each string
8.210 + representation as available.
8.211 +
8.212 + For example::
8.213 +
8.214 + for chunk in JSONEncoder().iterencode(bigobject):
8.215 + mysocket.write(chunk)
8.216 +
8.217 + """
8.218 + if self.check_circular:
8.219 + markers = {}
8.220 + else:
8.221 + markers = None
8.222 + if self.ensure_ascii:
8.223 + _encoder = encode_basestring_ascii
8.224 + else:
8.225 + _encoder = encode_basestring
8.226 + if self.encoding != 'utf-8':
8.227 + def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
8.228 + if isinstance(o, str):
8.229 + o = o.decode(_encoding)
8.230 + return _orig_encoder(o)
8.231 +
8.232 + def floatstr(o, allow_nan=self.allow_nan, _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
8.233 + # Check for specials. Note that this type of test is processor- and/or
8.234 + # platform-specific, so do tests which don't depend on the internals.
8.235 +
8.236 + if o != o:
8.237 + text = 'NaN'
8.238 + elif o == _inf:
8.239 + text = 'Infinity'
8.240 + elif o == _neginf:
8.241 + text = '-Infinity'
8.242 + else:
8.243 + return _repr(o)
8.244 +
8.245 + if not allow_nan:
8.246 + raise ValueError(
8.247 + "Out of range float values are not JSON compliant: " +
8.248 + repr(o))
8.249 +
8.250 + return text
8.251 +
8.252 +
8.253 + if _one_shot and c_make_encoder is not None and not self.indent and not self.sort_keys:
8.254 + _iterencode = c_make_encoder(
8.255 + markers, self.default, _encoder, self.indent,
8.256 + self.key_separator, self.item_separator, self.sort_keys,
8.257 + self.skipkeys, self.allow_nan)
8.258 + else:
8.259 + _iterencode = _make_iterencode(
8.260 + markers, self.default, _encoder, self.indent, floatstr,
8.261 + self.key_separator, self.item_separator, self.sort_keys,
8.262 + self.skipkeys, _one_shot)
8.263 + return _iterencode(o, 0)
8.264 +
8.265 +def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
8.266 + ## HACK: hand-optimized bytecode; turn globals into locals
8.267 + False=False,
8.268 + True=True,
8.269 + ValueError=ValueError,
8.270 + basestring=basestring,
8.271 + dict=dict,
8.272 + float=float,
8.273 + id=id,
8.274 + int=int,
8.275 + isinstance=isinstance,
8.276 + list=list,
8.277 + long=long,
8.278 + str=str,
8.279 + tuple=tuple,
8.280 + ):
8.281 +
8.282 + def _iterencode_list(lst, _current_indent_level):
8.283 + if not lst:
8.284 + yield '[]'
8.285 + return
8.286 + if markers is not None:
8.287 + markerid = id(lst)
8.288 + if markerid in markers:
8.289 + raise ValueError("Circular reference detected")
8.290 + markers[markerid] = lst
8.291 + buf = '['
8.292 + if _indent is not None:
8.293 + _current_indent_level += 1
8.294 + newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
8.295 + separator = _item_separator + newline_indent
8.296 + buf += newline_indent
8.297 + else:
8.298 + newline_indent = None
8.299 + separator = _item_separator
8.300 + first = True
8.301 + for value in lst:
8.302 + if first:
8.303 + first = False
8.304 + else:
8.305 + buf = separator
8.306 + if isinstance(value, basestring):
8.307 + yield buf + _encoder(value)
8.308 + elif value is None:
8.309 + yield buf + 'null'
8.310 + elif value is True:
8.311 + yield buf + 'true'
8.312 + elif value is False:
8.313 + yield buf + 'false'
8.314 + elif isinstance(value, (int, long)):
8.315 + yield buf + str(value)
8.316 + elif isinstance(value, float):
8.317 + yield buf + _floatstr(value)
8.318 + else:
8.319 + yield buf
8.320 + if isinstance(value, (list, tuple)):
8.321 + chunks = _iterencode_list(value, _current_indent_level)
8.322 + elif isinstance(value, dict):
8.323 + chunks = _iterencode_dict(value, _current_indent_level)
8.324 + else:
8.325 + chunks = _iterencode(value, _current_indent_level)
8.326 + for chunk in chunks:
8.327 + yield chunk
8.328 + if newline_indent is not None:
8.329 + _current_indent_level -= 1
8.330 + yield '\n' + (' ' * (_indent * _current_indent_level))
8.331 + yield ']'
8.332 + if markers is not None:
8.333 + del markers[markerid]
8.334 +
8.335 + def _iterencode_dict(dct, _current_indent_level):
8.336 + if not dct:
8.337 + yield '{}'
8.338 + return
8.339 + if markers is not None:
8.340 + markerid = id(dct)
8.341 + if markerid in markers:
8.342 + raise ValueError("Circular reference detected")
8.343 + markers[markerid] = dct
8.344 + yield '{'
8.345 + if _indent is not None:
8.346 + _current_indent_level += 1
8.347 + newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
8.348 + item_separator = _item_separator + newline_indent
8.349 + yield newline_indent
8.350 + else:
8.351 + newline_indent = None
8.352 + item_separator = _item_separator
8.353 + first = True
8.354 + if _sort_keys:
8.355 + items = dct.items()
8.356 + items.sort(key=lambda kv: kv[0])
8.357 + else:
8.358 + items = dct.iteritems()
8.359 + for key, value in items:
8.360 + if isinstance(key, basestring):
8.361 + pass
8.362 + # JavaScript is weakly typed for these, so it makes sense to
8.363 + # also allow them. Many encoders seem to do something like this.
8.364 + elif isinstance(key, float):
8.365 + key = _floatstr(key)
8.366 + elif key is True:
8.367 + key = 'true'
8.368 + elif key is False:
8.369 + key = 'false'
8.370 + elif key is None:
8.371 + key = 'null'
8.372 + elif isinstance(key, (int, long)):
8.373 + key = str(key)
8.374 + elif _skipkeys:
8.375 + continue
8.376 + else:
8.377 + raise TypeError("key " + repr(key) + " is not a string")
8.378 + if first:
8.379 + first = False
8.380 + else:
8.381 + yield item_separator
8.382 + yield _encoder(key)
8.383 + yield _key_separator
8.384 + if isinstance(value, basestring):
8.385 + yield _encoder(value)
8.386 + elif value is None:
8.387 + yield 'null'
8.388 + elif value is True:
8.389 + yield 'true'
8.390 + elif value is False:
8.391 + yield 'false'
8.392 + elif isinstance(value, (int, long)):
8.393 + yield str(value)
8.394 + elif isinstance(value, float):
8.395 + yield _floatstr(value)
8.396 + else:
8.397 + if isinstance(value, (list, tuple)):
8.398 + chunks = _iterencode_list(value, _current_indent_level)
8.399 + elif isinstance(value, dict):
8.400 + chunks = _iterencode_dict(value, _current_indent_level)
8.401 + else:
8.402 + chunks = _iterencode(value, _current_indent_level)
8.403 + for chunk in chunks:
8.404 + yield chunk
8.405 + if newline_indent is not None:
8.406 + _current_indent_level -= 1
8.407 + yield '\n' + (' ' * (_indent * _current_indent_level))
8.408 + yield '}'
8.409 + if markers is not None:
8.410 + del markers[markerid]
8.411 +
8.412 + def _iterencode(o, _current_indent_level):
8.413 + if isinstance(o, basestring):
8.414 + yield _encoder(o)
8.415 + elif o is None:
8.416 + yield 'null'
8.417 + elif o is True:
8.418 + yield 'true'
8.419 + elif o is False:
8.420 + yield 'false'
8.421 + elif isinstance(o, (int, long)):
8.422 + yield str(o)
8.423 + elif isinstance(o, float):
8.424 + yield _floatstr(o)
8.425 + elif isinstance(o, (list, tuple)):
8.426 + for chunk in _iterencode_list(o, _current_indent_level):
8.427 + yield chunk
8.428 + elif isinstance(o, dict):
8.429 + for chunk in _iterencode_dict(o, _current_indent_level):
8.430 + yield chunk
8.431 + else:
8.432 + if markers is not None:
8.433 + markerid = id(o)
8.434 + if markerid in markers:
8.435 + raise ValueError("Circular reference detected")
8.436 + markers[markerid] = o
8.437 + o = _default(o)
8.438 + for chunk in _iterencode(o, _current_indent_level):
8.439 + yield chunk
8.440 + if markers is not None:
8.441 + del markers[markerid]
8.442 +
8.443 + return _iterencode
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
9.2 +++ b/trunk/quahog/plugins/Weather/local/simplejson/scanner.py Thu Oct 22 10:14:56 2009 -0400
9.3 @@ -0,0 +1,65 @@
9.4 +"""JSON token scanner
9.5 +"""
9.6 +import re
9.7 +try:
9.8 + from _speedups import make_scanner as c_make_scanner
9.9 +except ImportError:
9.10 + c_make_scanner = None
9.11 +
9.12 +__all__ = ['make_scanner']
9.13 +
9.14 +NUMBER_RE = re.compile(
9.15 + r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?',
9.16 + (re.VERBOSE | re.MULTILINE | re.DOTALL))
9.17 +
9.18 +def py_make_scanner(context):
9.19 + parse_object = context.parse_object
9.20 + parse_array = context.parse_array
9.21 + parse_string = context.parse_string
9.22 + match_number = NUMBER_RE.match
9.23 + encoding = context.encoding
9.24 + strict = context.strict
9.25 + parse_float = context.parse_float
9.26 + parse_int = context.parse_int
9.27 + parse_constant = context.parse_constant
9.28 + object_hook = context.object_hook
9.29 +
9.30 + def _scan_once(string, idx):
9.31 + try:
9.32 + nextchar = string[idx]
9.33 + except IndexError:
9.34 + raise StopIteration
9.35 +
9.36 + if nextchar == '"':
9.37 + return parse_string(string, idx + 1, encoding, strict)
9.38 + elif nextchar == '{':
9.39 + return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook)
9.40 + elif nextchar == '[':
9.41 + return parse_array((string, idx + 1), _scan_once)
9.42 + elif nextchar == 'n' and string[idx:idx + 4] == 'null':
9.43 + return None, idx + 4
9.44 + elif nextchar == 't' and string[idx:idx + 4] == 'true':
9.45 + return True, idx + 4
9.46 + elif nextchar == 'f' and string[idx:idx + 5] == 'false':
9.47 + return False, idx + 5
9.48 +
9.49 + m = match_number(string, idx)
9.50 + if m is not None:
9.51 + integer, frac, exp = m.groups()
9.52 + if frac or exp:
9.53 + res = parse_float(integer + (frac or '') + (exp or ''))
9.54 + else:
9.55 + res = parse_int(integer)
9.56 + return res, m.end()
9.57 + elif nextchar == 'N' and string[idx:idx + 3] == 'NaN':
9.58 + return parse_constant('NaN'), idx + 3
9.59 + elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity':
9.60 + return parse_constant('Infinity'), idx + 8
9.61 + elif nextchar == '-' and string[idx:idx + 9] == '-Infinity':
9.62 + return parse_constant('-Infinity'), idx + 9
9.63 + else:
9.64 + raise StopIteration
9.65 +
9.66 + return _scan_once
9.67 +
9.68 +make_scanner = c_make_scanner or py_make_scanner
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
10.2 +++ b/trunk/quahog/plugins/Weather/local/simplejson/tool.py Thu Oct 22 10:14:56 2009 -0400
10.3 @@ -0,0 +1,37 @@
10.4 +r"""Command-line tool to validate and pretty-print JSON
10.5 +
10.6 +Usage::
10.7 +
10.8 + $ echo '{"json":"obj"}' | python -m simplejson.tool
10.9 + {
10.10 + "json": "obj"
10.11 + }
10.12 + $ echo '{ 1.2:3.4}' | python -m simplejson.tool
10.13 + Expecting property name: line 1 column 2 (char 2)
10.14 +
10.15 +"""
10.16 +import sys
10.17 +import simplejson
10.18 +
10.19 +def main():
10.20 + if len(sys.argv) == 1:
10.21 + infile = sys.stdin
10.22 + outfile = sys.stdout
10.23 + elif len(sys.argv) == 2:
10.24 + infile = open(sys.argv[1], 'rb')
10.25 + outfile = sys.stdout
10.26 + elif len(sys.argv) == 3:
10.27 + infile = open(sys.argv[1], 'rb')
10.28 + outfile = open(sys.argv[2], 'wb')
10.29 + else:
10.30 + raise SystemExit(sys.argv[0] + " [infile [outfile]]")
10.31 + try:
10.32 + obj = simplejson.load(infile)
10.33 + except ValueError, e:
10.34 + raise SystemExit(e)
10.35 + simplejson.dump(obj, outfile, sort_keys=True, indent=4)
10.36 + outfile.write('\n')
10.37 +
10.38 +
10.39 +if __name__ == '__main__':
10.40 + main()
11.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
11.2 +++ b/trunk/quahog/plugins/Weather/plugin.py Thu Oct 22 10:14:56 2009 -0400
11.3 @@ -0,0 +1,487 @@
11.4 +###
11.5 +# Copyright (c) 2005,2009, James Vega
11.6 +# All rights reserved.
11.7 +#
11.8 +# Redistribution and use in source and binary forms, with or without
11.9 +# modification, are permitted provided that the following conditions are met:
11.10 +#
11.11 +# * Redistributions of source code must retain the above copyright notice,
11.12 +# this list of conditions, and the following disclaimer.
11.13 +# * Redistributions in binary form must reproduce the above copyright notice,
11.14 +# this list of conditions, and the following disclaimer in the
11.15 +# documentation and/or other materials provided with the distribution.
11.16 +# * Neither the name of the author of this software nor the name of
11.17 +# contributors to this software may be used to endorse or promote products
11.18 +# derived from this software without specific prior written consent.
11.19 +#
11.20 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
11.21 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
11.22 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
11.23 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
11.24 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
11.25 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
11.26 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
11.27 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
11.28 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
11.29 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
11.30 +# POSSIBILITY OF SUCH DAMAGE.
11.31 +###
11.32 +
11.33 +import re
11.34 +
11.35 +# Specifically use our local copy since later versions changed their interface
11.36 +# and (depending on the version) don't work as well
11.37 +from local import BeautifulSoup
11.38 +
11.39 +import supybot.conf as conf
11.40 +import supybot.utils as utils
11.41 +from supybot.commands import *
11.42 +import supybot.ircutils as ircutils
11.43 +import supybot.callbacks as callbacks
11.44 +
11.45 +try:
11.46 + feedparser = utils.python.universalImport('feedparser', 'local.feedparser')
11.47 +except ImportError:
11.48 + raise callbacks.Error, \
11.49 + 'You need the feedparser module installed to use this plugin. ' \
11.50 + 'Download the module at <http://www.feedparser.org/>.'
11.51 +
11.52 +simplejson = None
11.53 +
11.54 +try:
11.55 + import json as simplejson
11.56 +except ImportError:
11.57 + pass
11.58 +
11.59 +try:
11.60 + # The 3rd party simplejson module was included in Python 2.6 and renamed to
11.61 + # json. Unfortunately, this conflicts with the 3rd party json module.
11.62 + # Luckily, the 3rd party json module has a different interface so we test
11.63 + # to make sure we aren't using it.
11.64 + if simplejson is None or hasattr(simplejson, 'read'):
11.65 + simplejson = utils.python.universalImport('simplejson',
11.66 + 'local.simplejson')
11.67 +except ImportError:
11.68 + raise callbacks.Error, \
11.69 + 'You need Python2.6 or the simplejson module installed to use ' \
11.70 + 'this plugin. Download the module at ' \
11.71 + '<http://undefined.org/python/#simplejson>.'
11.72 +
11.73 +unitAbbrevs = utils.abbrev(['Fahrenheit', 'Celsius', 'Centigrade', 'Kelvin'])
11.74 +unitAbbrevs['C'] = 'Celsius'
11.75 +unitAbbrevs['Ce'] = 'Celsius'
11.76 +
11.77 +noLocationError = 'No such location could be found.'
11.78 +class NoLocation(callbacks.Error):
11.79 + pass
11.80 +
11.81 +class Weather(callbacks.Plugin):
11.82 + weatherCommands = ('wunder', 'wunder rss', 'cnn', 'ham')
11.83 + threaded = True
11.84 + def callCommand(self, method, irc, msg, *args, **kwargs):
11.85 + try:
11.86 + super(Weather, self).callCommand(method, irc, msg, *args, **kwargs)
11.87 + except utils.web.Error, e:
11.88 + irc.error(str(e))
11.89 +
11.90 + def _noLocation():
11.91 + raise NoLocation, noLocationError
11.92 + _noLocation = staticmethod(_noLocation)
11.93 +
11.94 + def weather(self, irc, msg, args, location):
11.95 + """<US zip code | US/Canada city, state | Foreign city, country>
11.96 +
11.97 + Returns the approximate weather conditions for a given city.
11.98 + """
11.99 + channel = None
11.100 + if irc.isChannel(msg.args[0]):
11.101 + channel = msg.args[0]
11.102 + if not location: # no argument given: fall back to the user's stored location
11.103 + location = self.userValue('lastLocation', msg.prefix)
11.104 + if not location:
11.105 + raise callbacks.ArgumentError
11.106 + self.setUserValue('lastLocation', msg.prefix,
11.107 + location, ignoreNoUser=True)
11.108 + args = [location]
11.109 + commandName = self.registryValue('command', channel)
11.110 + firstCommand = commandName # the configured command is tried first
11.111 + command = self.getCommandMethod(commandName.split())
11.112 + try:
11.113 + command(irc, msg, args[:])
11.114 + except (NoLocation, utils.web.Error):
11.115 + self.log.info('%s lookup failed, Trying others.', firstCommand)
11.116 + for commandName in self.weatherCommands:
11.117 + if commandName != firstCommand:
11.118 + self.log.info('Trying %s.', commandName)
11.119 + try:
11.120 + command = self.getCommandMethod(commandName.split())
11.121 + command(irc, msg, args[:])
11.122 + self.log.info('%s lookup succeeded.', commandName)
11.123 + return
11.124 + except (NoLocation, utils.web.Error): # a web failure on one backup must not abort the remaining backups
11.125 + self.log.info('%s lookup failed as backup.',
11.126 + commandName)
11.127 + irc.error(format('Could not retrieve weather for %q.', location))
11.128 + weather = wrap(weather, [additional('text')])
11.129 +
11.130 + def _toCelsius(temp, unit):
11.131 + if unit == 'K':
11.132 + return temp - 273.15
11.133 + elif unit == 'F':
11.134 + return (temp - 32) * 5 /9
11.135 + else:
11.136 + return temp
11.137 + _toCelsius = staticmethod(_toCelsius)
11.138 +
11.139 + _temp = re.compile(r'(-?\d+)(.*?)(F|C)')
11.140 + def _getTemp(temp, deg, unit, chan):
11.141 + assert unit == unit.upper()
11.142 + assert temp == float(temp)
11.143 + default = conf.get(conf.supybot.plugins.Weather.temperatureUnit, chan)
11.144 + convert = conf.get(conf.supybot.plugins.Weather.convert, chan)
11.145 + # Short circuit if we're the same unit as the default or no conversion
11.146 + # has been requested
11.147 + if unitAbbrevs[unit] == default or not convert:
11.148 + return format('%0.1f%s%s', temp, deg, unit)
11.149 + temp = Weather._toCelsius(temp, unit)
11.150 + unit = 'C'
11.151 + if default == 'Kelvin':
11.152 + temp = temp + 273.15
11.153 + unit = 'K'
11.154 + deg = ' '
11.155 + elif default == 'Fahrenheit':
11.156 + temp = temp * 9 / 5 + 32
11.157 + unit = 'F'
11.158 + return '%0.1f%s%s' % (temp, deg, unit)
11.159 + _getTemp = staticmethod(_getTemp)
11.160 +
11.161 + _hamLoc = re.compile(
11.162 + r'<span class="Place">([^,]+), ([^,\n]+),(.*?)</span>', re.I)
11.163 + _interregex = re.compile(
11.164 + r'<span class="Place">([^,]+), ([^,\n]+?)</span>', re.I)
11.165 + _hamCond = re.compile(
11.166 + r'<td width="100%" colspan="2" align="center" class="Wx">([^<]+)</td>',
11.167 + re.I)
11.168 + _hamTemp = re.compile(
11.169 + r'<td valign="top" align="right" class="Temp">(-?\d+)(.*?)(F|C)</td>',
11.170 + re.I)
11.171 + _hamChill = re.compile(
11.172 + r'Wind Chill:</td>\s+<td align="right" class="Value">([^N][^<]+)</td>',
11.173 + re.I | re.S)
11.174 + _hamHeat = re.compile(
11.175 + r'Heat Index:</td>\s+<td align="right" class="Value">([^N][^<]+)</td>',
11.176 + re.I | re.S)
11.177 + _hamMultiLoc = re.compile(
11.178 + r'Select from one of[^<]+</b></font></td></tr>\s*<tr><td><font[^>]+>'
11.179 + r'\s*<a href="(/cgi-bin/hw3[^"]+)">', re.I | re.S)
11.180 + def ham(self, irc, msg, args, loc):
11.181 + """<US zip code | US/Canada city, state | Foreign city, country>
11.182 +
11.183 + Returns the approximate weather conditions for a given city.
11.184 + """
11.185 + url = 'http://www.hamweather.net/cgi-bin/hw3/hw3.cgi?' \
11.186 + 'config=&forecast=zandh&pands=%s&Submit=GO' % \
11.187 + utils.web.urlquote(loc.lower())
11.188 + html = utils.web.getUrl(url)
11.189 + if 'was not found' in html:
11.190 + self._noLocation()
11.191 +
11.192 + # ham seems to automatically return a location for duplicate names with
11.193 + # no list of other possibilities anymore, so this code may not be
11.194 + # needed
11.195 + if 'Multiple Locations for' in html:
11.196 + m = self._hamMultiLoc.search(html)
11.197 + if m:
11.198 + url = 'http://www.hamweather.net/%s' % m.group(1)
11.199 + html = utils.web.getUrl(url)
11.200 + else:
11.201 + self._noLocation()
11.202 + headData = self._hamLoc.search(html)
11.203 + if headData is not None:
11.204 + (city, state, country) = headData.groups()
11.205 + else:
11.206 + headData = self._interregex.search(html)
11.207 + if headData:
11.208 + (city, state) = headData.groups()
11.209 + else:
11.210 + self._noLocation()
11.211 + city = utils.web.htmlToText(city.strip())
11.212 + state = utils.web.htmlToText(state.strip())
11.213 + temp = self._hamTemp.search(html)
11.214 + if temp is not None:
11.215 + (temp, deg, unit) = temp.groups()
11.216 + deg = utils.web.htmlToText(deg)
11.217 + temp = self._getTemp(float(temp), deg, unit, msg.args[0])
11.218 + conds = self._hamCond.search(html)
11.219 + if conds is not None:
11.220 + conds = conds.group(1)
11.221 + index = ''
11.222 + chill = self._hamChill.search(html)
11.223 + if chill is not None:
11.224 + chill = chill.group(1)
11.225 + chill = utils.web.htmlToText(chill)
11.226 + tempsplit = self._temp.search(chill)
11.227 + if tempsplit:
11.228 + (chill, deg, unit) = tempsplit.groups()
11.229 + chill = self._getTemp(float(chill), deg, unit,msg.args[0])
11.230 + if float(chill[:-2]) < float(temp[:-2]):
11.231 + index = format(' (Wind Chill: %s)', chill)
11.232 + heat = self._hamHeat.search(html)
11.233 + if heat is not None:
11.234 + heat = heat.group(1)
11.235 + heat = utils.web.htmlToText(heat)
11.236 + tempsplit = self._temp.search(heat)
11.237 + if tempsplit:
11.238 + (heat, deg, unit) = tempsplit.groups()
11.239 + heat= self._getTemp(float(heat), deg, unit,msg.args[0])
11.240 + if float(heat[:-2]) > float(temp[:-2]):
11.241 + index = format(' (Heat Index: %s)', heat)
11.242 + if temp and conds and city and state:
11.243 + conds = conds.replace('Tsra', 'Thunderstorms')
11.244 + conds = conds.replace('Ts', 'Thunderstorms')
11.245 + s = format('The current temperature in %s, %s is %s%s. '
11.246 + 'Conditions: %s.',
11.247 + city, state, temp, index, conds)
11.248 + irc.reply(s.decode('latin1').encode('utf-8'))
11.249 + else:
11.250 + irc.errorPossibleBug('The format of the page was odd.')
11.251 + ham = wrap(ham, ['text'])
11.252 +
11.253 + _cnnSearchUrl = 'http://weather.cnn.com/weather/citySearch?' \
11.254 + 'search_term=%s&mode=json&filter=true'
11.255 + _cnnUrl='http://weather.cnn.com/weather/forecast.jsp?locCode=%s&zipCode=%s'
11.256 + _cnnFTemp = re.compile(r'<div class="cnnWeatherTempCurrent">' \
11.257 + r'(-?\d+)(°)</div>',
11.258 + re.I | re.S)
11.259 + _cnnCond = re.compile(r'<span class="cnnWeatherConditionCurrent">' \
11.260 + r'([^<]+)</span>',
11.261 + re.I | re.S)
11.262 + _cnnHumid = re.compile(r'Humidity: </b>(\d+%)', re.I | re.S)
11.263 + _cnnWind = re.compile(r'Wind: </b>([^<\n\r]+)', re.I | re.S)
11.264 + # Certain countries are expected to use a standard abbreviation, but
11.265 + # the site we pull weather from uses its own codes. Map obvious ones here.
11.266 + _cnnCountryMap = {'uk': 'en', 'de': 'ge'}
11.267 + def cnn(self, irc, msg, args, loc):
11.268 + """<US zip code | US/Canada city, state | Foreign city, country>
11.269 +
11.270 + Returns the approximate weather conditions for a given city.
11.271 + """
11.272 + if ' ' in loc:
11.273 + # If we received more than one word, the city may have a
11.274 + # multi-word name, e.g. ['Garden', 'City', 'KS'] instead of
11.275 + # ['Liberal', 'KS'].
11.276 + loc = utils.str.rsplit(loc, None, 1)
11.277 + state = loc.pop().lower()
11.278 + city = ' '.join(loc)
11.279 + city = city.rstrip(',').lower()
11.280 + if state in self._cnnCountryMap:
11.281 + state = self._cnnCountryMap[state]
11.282 + loc = ' '.join([city, state])
11.283 + else:
11.284 + # We received a single word: a zip code or station id.
11.285 + loc = loc.replace(',', '')
11.286 + url = self._cnnSearchUrl % (utils.web.urlquote(loc))
11.287 + json = simplejson.loads(utils.web.getUrl(url))
11.288 + if not json:
11.289 + self._noLocation()
11.290 + json = json[0]
11.291 + url = self._cnnUrl % (json['locCode'], json['zip'])
11.292 + text = utils.web.getUrl(url)
11.293 + location = ', '.join([json['city'], json['stateOrCountry']])
11.294 + temp = self._cnnFTemp.search(text)
11.295 + conds = self._cnnCond.search(text)
11.296 + humidity = self._cnnHumid.search(text)
11.297 + wind = self._cnnWind.search(text)
11.298 + if location and temp:
11.299 + (temp, deg) = temp.groups()
11.300 + unit = 'F'
11.301 + temp = self._getTemp(float(temp), deg, unit, msg.args[0])
11.302 + resp = [format('The current temperature in %s is %s.',
11.303 + location, temp)]
11.304 + if conds is not None:
11.305 + resp.append(format('Conditions: %s.', conds.group(1)))
11.306 + if humidity is not None:
11.307 + resp.append(format('Humidity: %s.', humidity.group(1)))
11.308 + if wind is not None:
11.309 + resp.append(format('Wind: %s.', wind.group(1)))
11.310 + resp = map(utils.web.htmlToText, resp)
11.311 + irc.reply(' '.join(resp))
11.312 + else:
11.313 + irc.errorPossibleBug('Could not find weather information.')
11.314 + cnn = wrap(cnn, ['text'])
11.315 +
11.316 + class wunder(callbacks.Commands):
11.317 + _backupUrl = re.compile(r'<a href="(/global/stations[^"]+)">')
11.318 +
11.319 + _wunderUrl = 'http://mobile.wunderground.com/cgi-bin/' \
11.320 + 'findweather/getForecast?query='
11.321 + _wunderSevere = re.compile(r'font color="?#ff0000"?>([^<]+)<', re.I)
11.322 + _wunderMultiLoc = re.compile(r'<a href="([^"]+)', re.I | re.S)
11.323 + def wunder(self, irc, msg, args, loc):
11.324 + """<US zip code | US/Canada city, state | Foreign city, country>
11.325 +
11.326 + Returns the approximate weather conditions for a given city.
11.327 + """
11.328 + url = '%s%s' % (self._wunderUrl, utils.web.urlquote(loc))
11.329 + text = utils.web.getUrl(url)
11.330 + if 'Search not found' in text or \
11.331 + re.search(r'size="2"> Place </font>', text, re.I):
11.332 + Weather._noLocation()
11.333 + if 'Place: Temperature' in text:
11.334 + m = self._backupUrl.search(text)
11.335 + if m is not None:
11.336 + url = 'http://mobile.wunderground.com' + m.group(1)
11.337 + text = utils.web.getUrl(url)
11.338 + severe = ''
11.339 + m = self._wunderSevere.search(text)
11.340 + if m:
11.341 + severe = ircutils.bold(format(' %s', m.group(1)))
11.342 + text = self._formatSymbols(text)
11.343 + soup = BeautifulSoup.BeautifulSoup()
11.344 + soup.feed(text)
11.345 + # Get the table with all the weather info
11.346 + table = soup.first('table', {'border':'1'})
11.347 + if not table:
11.348 + Weather._noLocation()
11.349 + trs = table.fetch('tr')
11.350 + (time, location) = trs.pop(0).fetch('b')
11.351 + time = time.string
11.352 + location = location.string
11.353 + info = {}
11.354 + def isText(t):
11.355 + return not isinstance(t, BeautifulSoup.NavigableText) \
11.356 + and t.contents
11.357 + def getText(t):
11.358 + s = t.string
11.359 + if s is BeautifulSoup.Null:
11.360 + t = t.contents
11.361 + num = t[0].string
11.362 + units = t[1].string
11.363 + # htmlToText strips leading whitespace, so we have to
11.364 + # handle strings that start with a space differently.
11.365 + if units.startswith(' '):
11.366 + units = utils.web.htmlToText(units)
11.367 + s = ' '.join((num, units))
11.368 + else:
11.369 + units = utils.web.htmlToText(units)
11.370 + s = ' '.join((num, units[0], units[1:]))
11.371 + return s
11.372 + for tr in trs:
11.373 + k = tr.td.string
11.374 + v = filter(isText, tr.fetch('td')[1].contents)
11.375 + value = map(getText, v)
11.376 + info[k] = ' '.join(value)
11.377 + temp = info['Temperature']
11.378 + if location and temp:
11.379 + (temp, deg, unit) = temp.split()[3:] # We only want temp format
11.380 + temp = Weather._getTemp(float(temp), deg, unit, msg.args[0])
11.381 + resp = ['The current temperature in %s is %s (%s).' %\
11.382 + (location, temp, time)]
11.383 + conds = info['Conditions']
11.384 + resp.append('Conditions: %s.' % info['Conditions'])
11.385 + humidity = info['Humidity']
11.386 + resp.append('Humidity: %s.' % info['Humidity'])
11.387 + # Apparently, the "Dew Point" and "Wind" categories are
11.388 + # occasionally set to "-" instead of an actual reading. So,
11.389 + # we'll just catch the ValueError from trying to unpack a tuple
11.390 + # of the wrong size.
11.391 + try:
11.392 + (dew, deg, unit) = info['Dew Point'].split()[3:]
11.393 + dew = Weather._getTemp(float(dew), deg, unit, msg.args[0])
11.394 + resp.append('Dew Point: %s.' % dew)
11.395 + except (ValueError, KeyError):
11.396 + pass
11.397 + try:
11.398 + wind = 'Wind: %s at %s %s.' % tuple(info['Wind'].split())
11.399 + resp.append(wind)
11.400 + except (ValueError, TypeError):
11.401 + pass
11.402 + try:
11.403 + (chill, deg, unit) = info['Windchill'].split()[3:]
11.404 + chill = Weather._getTemp(float(chill), deg,
11.405 + unit, msg.args[0])
11.406 + resp.append('Windchill: %s.' % chill)
11.407 + except (ValueError, KeyError):
11.408 + pass
11.409 + if info['Pressure']:
11.410 + resp.append('Pressure: %s.' % info['Pressure'])
11.411 + resp.append(severe)
11.412 + resp = map(utils.web.htmlToText, resp)
11.413 + irc.reply(' '.join(resp).decode('latin1').encode('utf-8'))
11.414 + else:
11.415 + Weather._noLocation()
11.416 + wunder = wrap(wunder, ['text'])
11.417 +
11.418 + _rsswunderUrl = 'http://www.wunderground.com/cgi-bin/findweather/' \
11.419 + 'getForecast?query=%s'
11.420 + _rsswunderfeed = re.compile(
11.421 + r'<link rel="alternate".*href="([^"]+)" */?>', re.I)
11.422 + _rsswunderSevere = re.compile(
11.423 + r'font color="?#ff0000"?><b>([^<]+)<', re.I)
11.424 + _rsswunderLocation = re.compile(
11.425 + r'<title>(?:(.*) Weather from Weather Underground|'
11.426 + r'Weather Underground - (.*))</title>', re.I)
11.427 + _rsswunderForecastDate = re.compile(r'Forecast for (.*) as of', re.I)
11.428 + def rss(self, irc, msg, args, loc):
11.429 + """<US zip code | US/Canada city, state | Foreign city, country>
11.430 +
11.431 + Returns the approximate weather conditions for a given city.
11.432 + """
11.433 + url = self._rsswunderUrl % utils.web.urlquote(loc)
11.434 + url = url.replace('%20', '+')
11.435 + text = utils.web.getUrl(url)
11.436 + if 'Search not found' in text or \
11.437 + re.search(r'size="2"> Place </font>', text, re.I):
11.438 + Weather._noLocation()
11.439 + if 'Search Results' in text:
11.440 + m = self._backupUrl.search(text)
11.441 + if m is not None:
11.442 + url = 'http://www.wunderground.com' + m.group(1)
11.443 + text = utils.web.getUrl(url)
11.444 + else:
11.445 + Weather._noLocation()
11.446 + self._rss(irc, text)
11.447 + rss = wrap(rss, ['text'])
11.448 +
11.449 + def _rss(self, irc, text):
11.450 + severe = None
11.451 + m = self._rsswunderSevere.search(text)
11.452 + if m:
11.453 + severe = ircutils.bold(m.group(1))
11.454 + feed = self._rsswunderfeed.search(text)
11.455 + if not feed:
11.456 + Weather._noLocation()
11.457 + feed = feed.group(1)
11.458 + rss = utils.web.getUrl(feed)
11.459 + rss = self._formatSymbols(rss)
11.460 + rss = rss.replace(":", ": ")
11.461 + rss = rss.replace(": ", ": ")
11.462 + resp = []
11.463 + location = self._rsswunderLocation.search(rss)
11.464 + if location is not None:
11.465 + title = filter(None, location.groups())
11.466 + if title:
11.467 + resp.append('Weather for %s' % title[0])
11.468 + info = feedparser.parse(rss)
11.469 + for e in info['entries']:
11.470 + d = self._rsswunderForecastDate.search(e['title'])
11.471 + if d is not None:
11.472 + resp.append(d.group(1) + ' - Conditions: ' + e['summary'])
11.473 + else:
11.474 + resp.append(e['summary'])
11.475 + resp = [s.encode('utf-8').rtrim('.') for s in resp]
11.476 + if severe is not None:
11.477 + resp.append(severe)
11.478 + irc.reply(utils.web.htmlToText('; '.join(resp)))
11.479 +
11.480 + def _formatSymbols(self, text):
11.481 + text = text.replace("&", "&")
11.482 + text = text.replace("°", "°")
11.483 + text = text.replace(" ° ", "°")
11.484 + text = text.replace("°", "\xb0")
11.485 + return text
11.486 +
11.487 +Class = Weather
11.488 +
11.489 +
11.490 +# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
12.2 +++ b/trunk/quahog/plugins/Weather/test.py Thu Oct 22 10:14:56 2009 -0400
12.3 @@ -0,0 +1,110 @@
12.4 +###
12.5 +# Copyright (c) 2005,2009, James Vega
12.6 +# All rights reserved.
12.7 +#
12.8 +# Redistribution and use in source and binary forms, with or without
12.9 +# modification, are permitted provided that the following conditions are met:
12.10 +#
12.11 +# * Redistributions of source code must retain the above copyright notice,
12.12 +# this list of conditions, and the following disclaimer.
12.13 +# * Redistributions in binary form must reproduce the above copyright notice,
12.14 +# this list of conditions, and the following disclaimer in the
12.15 +# documentation and/or other materials provided with the distribution.
12.16 +# * Neither the name of the author of this software nor the name of
12.17 +# contributors to this software may be used to endorse or promote products
12.18 +# derived from this software without specific prior written consent.
12.19 +#
12.20 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
12.21 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
12.22 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
12.23 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
12.24 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
12.25 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
12.26 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
12.27 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
12.28 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
12.29 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
12.30 +# POSSIBILITY OF SUCH DAMAGE.
12.31 +###
12.32 +
12.33 +from supybot.test import *
12.34 +
12.35 +class WeatherTestCase(PluginTestCase):
12.36 + plugins = ('Weather',)
12.37 + if network:
12.38 + def testHam(self):
12.39 + self.assertNotError('ham Columbus, OH')
12.40 + self.assertNotError('ham 43221')
12.41 + self.assertNotRegexp('ham Paris, FR', 'Virginia')
12.42 + self.assertError('ham alsdkfjasdl, asdlfkjsadlfkj')
12.43 + self.assertNotError('ham London, gb')
12.44 + self.assertNotError('ham London, GB')
12.45 + self.assertNotError('ham Munich, germany')
12.46 + self.assertNotError('ham Tucson, AZ')
12.47 + # "Multiple locations found" test
12.48 + self.assertNotError('ham sandwich')
12.49 +
12.50 + def testCnn(self):
12.51 + self.assertNotError('cnn Columbus, OH')
12.52 + self.assertNotError('cnn 43221')
12.53 + self.assertNotRegexp('cnn Paris, FR', 'Virginia')
12.54 + self.assertError('cnn alsdkfjasdl, asdlfkjsadlfkj')
12.55 + self.assertNotError('cnn London, uk')
12.56 + self.assertNotError('cnn London, UK')
12.57 + self.assertNotError('cnn Nurnberg, de')
12.58 + self.assertNotError('cnn Tucson, AZ')
12.59 +
12.60 + def testWunder(self):
12.61 + self.assertNotError('wunder Columbus, OH')
12.62 + self.assertNotError('wunder 43221')
12.63 + self.assertNotRegexp('wunder Paris, FR', 'Virginia')
12.64 + self.assertError('wunder alsdkfjasdl, asdlfkjsadlfkj')
12.65 + self.assertNotError('wunder London, england')
12.66 + self.assertNotError('wunder Munich, germany')
12.67 + self.assertNotError('wunder Tucson, AZ')
12.68 +
12.69 + def testTemperatureUnit(self):
12.70 + try:
12.71 + orig = conf.supybot.plugins.Weather.temperatureUnit()
12.72 + conf.supybot.plugins.Weather.temperatureUnit.setValue('F')
12.73 + self.assertRegexp('cnn Columbus, OH', r'is -?\d+\.\d[^F]*F')
12.74 + self.assertRegexp('ham Columbus, OH', r'is -?\d+\.\d[^F]*F')
12.75 + conf.supybot.plugins.Weather.temperatureUnit.setValue('C')
12.76 + self.assertRegexp('cnn Columbus, OH', r'is -?\d+\.\d[^C]*C')
12.77 + self.assertRegexp('ham Columbus, OH', r'is -?\d+\.\d[^C]*C')
12.78 + conf.supybot.plugins.Weather.temperatureUnit.setValue('K')
12.79 + self.assertRegexp('cnn Columbus, OH', r'is -?\d+\.\d K')
12.80 + self.assertRegexp('ham Columbus, OH', r'is -?\d+\.\d K')
12.81 + finally:
12.82 + conf.supybot.plugins.Weather.temperatureUnit.setValue(orig)
12.83 +
12.84 + def testNoEscapingWebError(self):
12.85 + self.assertNotRegexp('ham "buenos aires"', 'WebError')
12.86 +
12.87 + def testWeatherRepliesWithBogusLocation(self):
12.88 + self.assertRegexp('weather some place that doesn\'t exist', r'.')
12.89 +
12.90 + def testConvertConfig(self):
12.91 + try:
12.92 + convert = conf.supybot.plugins.Weather.convert()
12.93 + unit = conf.supybot.plugins.Weather.temperatureUnit()
12.94 + conf.supybot.plugins.Weather.convert.setValue(False)
12.95 + conf.supybot.plugins.Weather.temperatureUnit.setValue('C')
12.96 + self.assertRegexp('ham london, gb', r'-?\d+\.\d[^C]*C')
12.97 + self.assertRegexp('ham 02115', r'-?\d+\.\d[^F]*F')
12.98 + conf.supybot.plugins.Weather.temperatureUnit.setValue('F')
12.99 + self.assertRegexp('ham london, gb', r'-?\d+\.\d[^C]*C')
12.100 + self.assertRegexp('ham 02115', r'-?\d+\.\d[^F]*F')
12.101 + conf.supybot.plugins.Weather.convert.setValue(True)
12.102 + conf.supybot.plugins.Weather.temperatureUnit.setValue('C')
12.103 + self.assertRegexp('ham london, gb', r'-?\d+\.\d[^C]*C')
12.104 + self.assertRegexp('ham 02115', r'-?\d+\.\d[^C]*C')
12.105 + conf.supybot.plugins.Weather.temperatureUnit.setValue('F')
12.106 + self.assertRegexp('ham london, gb', r'-?\d+\.\d[^F]*F')
12.107 + self.assertRegexp('ham 02115', r'-?\d+\.\d[^F]*F')
12.108 + finally:
12.109 + conf.supybot.plugins.Weather.convert.setValue(convert)
12.110 + conf.supybot.plugins.Weather.temperatureUnit.setValue(unit)
12.111 +
12.112 +
12.113 +# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
13.1 --- a/trunk/quahog/plugins/Weather Thu Oct 22 10:12:03 2009 -0400
13.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
13.3 @@ -1,1 +0,0 @@
13.4 -/home/schultmc/.supybot/Supybot-Weather
13.5 \ No newline at end of file