Index: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
@@ -16,6 +16,7 @@
 
 import wikipedia as pywikibot
 import re
+from HTMLParser import HTMLParser
 
 def unescape(s):
     """Replace escaped HTML-special characters by their originals"""
@@ -219,6 +220,40 @@
     return toRemoveR.sub('', text)
 
 
+def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small', 'sup']):
+    """
+    Return text without the portions where HTML markup is disabled.
+
+    Parts that can/will be removed are --
+    * HTML and all wiki tags
+
+    The exact set of parts which should NOT be removed can be passed as the
+    'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
+    """
+    # try to merge with 'removeDisabledParts()' above into one generic function
+
+    # thanks to http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-class.html
+    parser = _GetDataHTML()
+    parser.keeptags = keeptags
+    parser.feed(text)
+    parser.close()
+    return parser.textdata
+
+# thanks to http://docs.python.org/library/htmlparser.html
+class _GetDataHTML(HTMLParser):
+    textdata = u''
+    keeptags = []
+
+    def handle_data(self, data):
+        self.textdata += data
+
+    def handle_starttag(self, tag, attrs):
+        if tag in self.keeptags: self.textdata += u"<%s>" % tag
+
+    def handle_endtag(self, tag):
+        if tag in self.keeptags: self.textdata += u"</%s>" % tag
+
+
 def isDisabled(text, index, tags = ['*']):
     """
     Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
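
For reference, here is a minimal usage sketch of the new removeHTMLParts() helper. It is not part of the patch: the sample markup and the import path (guessed from the Index: line above) are assumptions, and the expected output is derived from the handler logic in _GetDataHTML.

    # Hypothetical usage sketch, not part of the patch. Assumes textlib is
    # importable as pywikibot.textlib from a pywikipedia trunk checkout.
    from pywikibot import textlib

    sample = u"Some <b>bold</b> text with <tt>code</tt> and <ref>a note</ref>."

    # Tags on the default keeptags list ('tt', 'nowiki', 'small', 'sup') are
    # re-emitted; every other tag is dropped while its enclosed text is kept.
    print textlib.removeHTMLParts(sample)
    # Some bold text with <tt>code</tt> and a note.

    # An empty keeptags list strips all tags, leaving only the text data.
    print textlib.removeHTMLParts(sample, keeptags=[])
    # Some bold text with code and a note.

One design consequence worth noting: handle_starttag() re-emits kept tags as u"<%s>" % tag, so any attributes on a kept tag (e.g. <small style="...">) are discarded and it comes back as a bare <small>.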